view release on metacpan or search on metacpan
simdjson.cpp view on Meta::CPAN
// All includes referencing simdjson headers *not* under simdjson/generic must be here!
// Otherwise, amalgamation will fail.
/* skipped duplicate #include "simdjson/base.h" */
/* including simdjson/implementation.h: #include "simdjson/implementation.h" */
/* begin file simdjson/implementation.h */
#ifndef SIMDJSON_IMPLEMENTATION_H
#define SIMDJSON_IMPLEMENTATION_H
/* including simdjson/internal/atomic_ptr.h: #include "simdjson/internal/atomic_ptr.h" */
/* begin file simdjson/internal/atomic_ptr.h */
#ifndef SIMDJSON_INTERNAL_ATOMIC_PTR_H
#define SIMDJSON_INTERNAL_ATOMIC_PTR_H
/* skipped duplicate #include "simdjson/base.h" */
#include <atomic>
namespace simdjson {
namespace internal {
/**
 * Thin wrapper around std::atomic<T*> that can be used like a plain pointer.
 *
 * Every read goes through an atomic load and assignment goes through an atomic
 * store, both with the default memory order of std::atomic.
 */
template<typename T>
class atomic_ptr {
public:
  /** Wraps the given raw pointer. Not explicit, so a T* converts implicitly. */
  atomic_ptr(T *_ptr) : ptr(_ptr) {}

  // Read-only access: each call performs one atomic load.
  operator const T*() const { return ptr.load(); }
  const T& operator*() const { return *ptr.load(); }
  const T* operator->() const { return ptr.load(); }

  // Mutable access: each call performs one atomic load.
  operator T*() { return ptr.load(); }
  T& operator*() { return *ptr.load(); }
  T* operator->() { return ptr.load(); }

  /** Atomically replaces the stored pointer and returns *this. */
  atomic_ptr& operator=(T *_ptr) { ptr.store(_ptr); return *this; }

private:
  std::atomic<T*> ptr;
};
} // namespace internal
} // namespace simdjson
#endif // SIMDJSON_INTERNAL_ATOMIC_PTR_H
/* end file simdjson/internal/atomic_ptr.h */
/* including simdjson/internal/dom_parser_implementation.h: #include "simdjson/internal/dom_parser_implementation.h" */
/* begin file simdjson/internal/dom_parser_implementation.h */
#ifndef SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
#define SIMDJSON_INTERNAL_DOM_PARSER_IMPLEMENTATION_H
/* skipped duplicate #include "simdjson/base.h" */
/* skipped duplicate #include "simdjson/error.h" */
#include <memory>
namespace simdjson {
namespace dom {
class document;
} // namespace dom
/**
* This enum is used with the dom_parser_implementation::stage1 function.
* 1) The regular mode expects a fully formed JSON document.
* 2) The streaming_partial mode expects a possibly truncated
* input within a stream of JSON documents.
* 3) The streaming_final mode allows us to truncate final
* unterminated strings. It is useful in conjunction with streaming_partial.
*/
enum class stage1_mode { regular, streaming_partial, streaming_final};
/**
 * Tells whether stage 1 runs in one of the streaming modes, i.e. whether
 * mode is anything other than stage1_mode::regular (streaming_partial or
 * streaming_final).
 */
inline bool is_streaming(stage1_mode mode) {
  // performance note: a single comparison against the one non-streaming
  // enumerator is probably faster than comparing against both streaming ones.
  const bool is_regular = (mode == stage1_mode::regular);
  return !is_regular;
}
namespace internal {
/**
* An implementation of simdjson's DOM parser for a particular CPU architecture.
*
* This class is expected to be accessed only by pointer, and never move in memory (though the
* pointer can move).
*/
class dom_parser_implementation {
public:
/**
* @private For internal implementation use
*
* Run a full JSON parse on a single document (stage1 + stage2).
*
* Guaranteed only to be called when capacity > document length.
*
* Overridden by each implementation.
*
* @param buf The json document to parse. *MUST* be allocated up to len + SIMDJSON_PADDING bytes.
* @param len The length of the json document.
* @return The error code, or SUCCESS if there was no error.
*/
simdjson_warn_unused virtual error_code parse(const uint8_t *buf, size_t len, dom::document &doc) noexcept = 0;
/**
* @private For internal implementation use
*
* Stage 1 of the document parser.
*
* Guaranteed only to be called when capacity > document length.
*
* Overridden by each implementation.
*
* @param buf The json document to parse.
* @param len The length of the json document.
* @param streaming Whether this is being called by parser::parse_many.
* @return The error code, or SUCCESS if there was no error.
*/
simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, stage1_mode streaming) noexcept = 0;
/**
* @private For internal implementation use
*
* Stage 2 of the document parser.
*
* Called after stage1().
*
* Overridden by each implementation.
*
* @param doc The document to output to.
* @return The error code, or SUCCESS if there was no error.
*/
simdjson_warn_unused virtual error_code stage2(dom::document &doc) noexcept = 0;
/**
* @private For internal implementation use
*
* Stage 2 of the document parser for parser::parse_many.
*
* Guaranteed only to be called after stage1().
* Overridden by each implementation.
*
* @param doc The document to output to.
* @return The error code, SUCCESS if there was no error, or EMPTY if all documents have been parsed.
*/
simdjson_warn_unused virtual error_code stage2_next(dom::document &doc) noexcept = 0;
/**
* Unescape a valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
* must be an unescaped quote terminating the string. It returns the final output
* position as pointer. In case of error (e.g., the string has bad escaped codes),
* then nullptr is returned. It is assumed that the output buffer is large
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
* SIMDJSON_PADDING bytes.
*
* Overridden by each implementation.
*
* @param src pointer to the beginning of a valid UTF-8 JSON string, must end with an unescaped quote.
* @param dst pointer to a destination buffer, it must point a region in memory of sufficient size.
* @param allow_replacement whether we allow a replacement character when the UTF-8 contains unmatched surrogate pairs.
* @return end of the written region (exclusive) or nullptr in case of error.
*/
simdjson_warn_unused virtual uint8_t *parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept = 0;
/**
* Unescape a NON-valid UTF-8 string from src to dst, stopping at a final unescaped quote. There
* must be an unescaped quote terminating the string. It returns the final output
* position as pointer. In case of error (e.g., the string has bad escaped codes),
* then nullptr is returned. It is assumed that the output buffer is large
* enough. E.g., if src points at 'joe"', then dst needs to have four free bytes +
* SIMDJSON_PADDING bytes.
*
* Overridden by each implementation.
*
* @param src pointer to the beginning of a possibly invalid UTF-8 JSON string, must end with an unescaped quote.
* @param dst pointer to a destination buffer, it must point a region in memory of sufficient size.
* @return end of the written region (exclusive) or nullptr in case of error.
*/
simdjson_warn_unused virtual uint8_t *parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept = 0;
simdjson.cpp view on Meta::CPAN
private:
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
json_scanner scanner{};
utf8_checker checker{};
bit_indexer indexer;
uint64_t prev_structurals = 0;
uint64_t unescaped_chars_error = 0;
};
// Constructs an indexer whose bit_indexer member is initialized from structural_indexes.
// All other members (scanner, checker, prev_structurals, unescaped_chars_error) keep
// their in-class default initializers.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
// Skip the last character if it is partial: returns the length of buf once a
// multi-byte UTF-8 character that starts within the last three bytes but is cut
// off by the end of the buffer has been removed, or len unchanged otherwise.
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  if (simdjson_unlikely(len < 3)) {
    // Short input: only inspect the bytes that actually exist.
    if (len == 0) { return len; }
    if (buf[len-1] >= 0xc0) { return len-1; } // lead byte of a 2-, 3- or 4-byte character with only 1 byte present
    if (len == 2 && buf[len-2] >= 0xe0) { return len-2; } // lead byte of a 3- or 4-byte character with only 2 bytes present
    return len;
  }
  // At least three bytes are available, so all three trailing positions can be read.
  const uint8_t last_byte = buf[len-1];
  const uint8_t second_to_last_byte = buf[len-2];
  const uint8_t third_to_last_byte = buf[len-3];
  if (last_byte >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte present
  if (second_to_last_byte >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes present
  if (third_to_last_byte >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes present
  return len;
}
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans has some dependency on the first ones finishing it, but
// they can make a lot of progress before they need that information.
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
/**
 * Runs stage 1 over buf: feeds it to a json_structural_indexer in blocks of
 * STEP_SIZE bytes and lets finish() complete the structural index held by parser.
 *
 * @param buf     The input bytes.
 * @param len     The number of input bytes; in the streaming modes it is first reduced by trim_partial_utf8().
 * @param parser  The parser whose structural_indexes buffer is filled.
 * @param partial The stage 1 mode.
 * @return CAPACITY if len exceeds parser.capacity(), EMPTY if len is zero, UTF8_ERROR if
 *         trimming leaves nothing, UNEXPECTED_ERROR if there is no final block, otherwise
 *         whatever finish() returns.
 */
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
  if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
  // Everything below may assume len > 0.
  if (len == 0) { return EMPTY; }
  if (is_streaming(partial)) {
    len = trim_partial_utf8(buf, len);
    // A window that is empty after dropping the partial UTF-8 bytes
    // most likely means the input has a UTF-8 formatting error.
    if (len == 0) { return UTF8_ERROR; }
  }

  buf_block_reader<STEP_SIZE> reader(buf, len);
  json_structural_indexer structural_indexer(parser.structural_indexes.get());

  // Process every full block; the last block is handled separately below.
  for (; reader.has_full_block(); ) {
    structural_indexer.step<STEP_SIZE>(reader.full_block(), reader);
  }

  // The last block is always present unless the input is empty, which was ruled out above.
  uint8_t block[STEP_SIZE];
  const auto remainder_length = reader.get_remainder(block);
  if (simdjson_unlikely(remainder_length == 0)) { return UNEXPECTED_ERROR; }
  structural_indexer.step<STEP_SIZE>(block, reader);

  return structural_indexer.finish(parser, reader.block_index(), len, partial);
}
// Handles one 128-byte block as two consecutive 64-byte halves: both halves are
// run through the scanner first, then through next(), and finally the reader is advanced.
template<>
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
  simd::simd8x64<uint8_t> lower_half(block);
  simd::simd8x64<uint8_t> upper_half(block + 64);
  json_block lower_scan = scanner.next(lower_half);
  json_block upper_scan = scanner.next(upper_half);
  // The upper half starts 64 bytes after the block index reported by the reader.
  this->next(lower_half, lower_scan, reader.block_index());
  this->next(upper_half, upper_scan, reader.block_index() + 64);
  reader.advance();
}
// Handles one 64-byte block: runs it through the scanner, then through next(),
// and finally advances the reader.
template<>
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> input(block);
  json_block scanned = scanner.next(input);
  this->next(input, scanned, reader.block_index());
  reader.advance();
}
// Consumes one scanned 64-byte input: optionally feeds it to the UTF-8 checker, writes out
// the structurals remembered from the previous call, remembers this block's structurals for
// the next call, and accumulates the unescaped-character error mask.
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Mask of the bytes that are <= 0x1F.
  uint64_t low_bytes = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Output *last* iteration's structurals to the parser, hence the offset of 64.
  indexer.write(uint32_t(idx-64), prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error = unescaped_chars_error | block.non_quote_inside_string(low_bytes);
}
/**
* Completes stage 1 once the last block has been passed through step().
*
* Writes out the structurals of the final iteration, checks the scanner result and the
* accumulated unescaped-character mask, stores the number of structural indexes in parser,
* appends three padding entries (len, len, 0) after the last index, applies the
* streaming_partial / streaming_final truncation rules, and finally returns the result of
* the UTF-8 checker.
*
* @param parser The parser whose structural_indexes, n_structural_indexes and
* next_structural_index are written.
* @param idx The block index handed to indexer.write() (offset by 64) for the final write.
* @param len The input length; stored in the padding entries of structural_indexes.
* @param partial The stage 1 mode; selects the regular, streaming_partial or streaming_final handling.
* @return The scanner error, UNESCAPED_CHARS, EMPTY, UNEXPECTED_ERROR or CAPACITY on the
* corresponding early exits, otherwise checker.errors().
*/
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
// Number of indexes written so far: distance between the indexer's tail and the start of the buffer.
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
* The On-Demand API requires special padding.
*
* This is related to https://github.com/simdjson/simdjson/issues/906
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
* structural character, it quickly stops.
* Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
* We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
* continues, then it must be [,] or }.
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
// The last structural index must not point past the end of the input.
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
// As above, drop the structural of an unclosed string.
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
// otherwise, it will copy some prior index.
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
// This next line is critical, do not change it unless you understand what you are
// doing.
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
// We tolerate an unclosed string at the very end of the stream. Indeed, users
// often load their data in bulk without being careful and they want us to ignore
// the trailing garbage.
return EMPTY;
}
}
// The structural checks passed; the final result is whatever the UTF-8 checker reports.
checker.check_eof();
return checker.errors();
}
} // namespace stage1
} // unnamed namespace
} // namespace arm64
} // namespace simdjson
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
/* end file generic/stage1/json_structural_indexer.h for arm64 */
/* including generic/stage1/utf8_validator.h for arm64: #include <generic/stage1/utf8_validator.h> */
/* begin file generic/stage1/utf8_validator.h for arm64 */
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
namespace simdjson {
namespace arm64 {
namespace {
namespace stage1 {
/**
* Validates that the string is actual UTF-8.
*/
template<class checker>
bool generic_validate_utf8(const uint8_t * input, size_t length) {
checker c{};
buf_block_reader<64> reader(input, length);
simdjson.cpp view on Meta::CPAN
// do not pick it up.
uint64_t op = simd8x64<bool>(
v.chunks[0].any_bits_set(0x7),
v.chunks[1].any_bits_set(0x7),
v.chunks[2].any_bits_set(0x7),
v.chunks[3].any_bits_set(0x7)
).to_bitmask();
uint64_t whitespace = simd8x64<bool>(
v.chunks[0].any_bits_set(0x18),
v.chunks[1].any_bits_set(0x18),
v.chunks[2].any_bits_set(0x18),
v.chunks[3].any_bits_set(0x18)
).to_bitmask();
return { whitespace, op };
}
// Returns true when no byte of the 64-byte input has its high bit set: all bytes are
// OR-ed together and the largest byte of the result is compared against 0x80.
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  simd8<uint8_t> combined = input.reduce_or();
  const bool has_high_bit = !(combined.max_val() < 0x80u);
  return !has_high_bit;
}
// Computes, per byte position, whether a continuation byte is required there, given the
// input shifted by one (prev1), two (prev2) and three (prev3) bytes.
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  simd8<bool> follows_lead_of_2_plus = prev1 >= uint8_t(0xc0u); // previous byte starts a 2-, 3- or 4-byte character
  simd8<bool> follows_lead_of_3_plus = prev2 >= uint8_t(0xe0u); // byte two back starts a 3- or 4-byte character
  simd8<bool> follows_lead_of_4 = prev3 >= uint8_t(0xf0u);      // byte three back starts a 4-byte character
  // The three masks are combined with ^ rather than |, because ^ is commutative and the
  // caller is using ^ as well. This is fine because errors only have to be reported for
  // positions with 0-1 lead bytes: multiple lead bytes imply 2 overlapping multibyte
  // characters, and in that case at least *one* lead byte is guaranteed to be part of only
  // 1 other multibyte character, where the error will be detected.
  return follows_lead_of_2_plus ^ follows_lead_of_3_plus ^ follows_lead_of_4;
}
// Computes a byte mask whose high bit is set at every position that must hold the third
// or fourth byte of a multi-byte character, given the input shifted by two (prev2) and
// three (prev3) bytes.
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // After the saturating subtraction only 111_____ will be >= 0x80.
  simd8<uint8_t> third_byte_mask = prev2.saturating_sub(0xe0u-0x80);
  // After the saturating subtraction only 1111____ will be >= 0x80.
  simd8<uint8_t> fourth_byte_mask = prev3.saturating_sub(0xf0u-0x80);
  return third_byte_mask | fourth_byte_mask;
}
} // unnamed namespace
} // namespace arm64
} // namespace simdjson
//
// Stage 2
//
//
// Implementation-specific overrides
//
namespace simdjson {
namespace arm64 {
// Minifies buf into dst by forwarding to the arm64 stage 1 minifier with a 64-byte step;
// dst_len is passed through by reference.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  const error_code status = arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
  return status;
}
// Stage 1 for arm64: records the input buffer and its length in the parser, then runs the
// structural indexer over it with a 64-byte step.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  // Remember the input in the parser before indexing it.
  this->buf = _buf;
  this->len = _len;
  const error_code status = arm64::stage1::json_structural_indexer::index<64>(this->buf, this->len, *this, streaming);
  return status;
}
// Validates that the len bytes at buf are UTF-8 by forwarding to the arm64 generic validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = arm64::stage1::generic_validate_utf8(buf, len);
  return is_valid;
}
// Stage 2: forwards to the tape builder's parse_document<false>, writing into _doc.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  const error_code status = stage2::tape_builder::parse_document<false>(*this, _doc);
  return status;
}
// Stage 2 for the next document: forwards to the tape builder's parse_document<true>,
// writing into _doc.
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  const error_code status = stage2::tape_builder::parse_document<true>(*this, _doc);
  return status;
}
// Unescapes the string at src into dst by forwarding to the arm64 string parser and
// returns the pointer that parser yields.
SIMDJSON_NO_SANITIZE_MEMORY
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool allow_replacement) const noexcept {
  uint8_t *const written_end = arm64::stringparsing::parse_string(src, dst, allow_replacement);
  return written_end;
}
// Unescapes the possibly non-valid UTF-8 string at src into dst by forwarding to the arm64
// string parser and returns the pointer that parser yields.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
  uint8_t *const written_end = arm64::stringparsing::parse_wobbly_string(src, dst);
  return written_end;
}
// Full parse of a single document: runs stage 1 in regular mode and, only if that
// reported no error, stage 2 into _doc.
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  const error_code stage1_error = stage1(_buf, _len, stage1_mode::regular);
  // A stage 1 failure is returned as is; stage 2 is not attempted.
  return stage1_error ? stage1_error : stage2(_doc);
}
} // namespace arm64
} // namespace simdjson
/* including simdjson/arm64/end.h: #include <simdjson/arm64/end.h> */
/* begin file simdjson/arm64/end.h */
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include "simdjson/arm64/base.h" */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
/* undefining SIMDJSON_IMPLEMENTATION from "arm64" */
#undef SIMDJSON_IMPLEMENTATION
/* end file simdjson/arm64/end.h */
#endif // SIMDJSON_SRC_ARM64_CPP
/* end file arm64.cpp */
#endif
#if SIMDJSON_IMPLEMENTATION_HASWELL
/* including haswell.cpp: #include <haswell.cpp> */
/* begin file haswell.cpp */
#ifndef SIMDJSON_SRC_HASWELL_CPP
#define SIMDJSON_SRC_HASWELL_CPP
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include <base.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
/* including simdjson/haswell.h: #include <simdjson/haswell.h> */
/* begin file simdjson/haswell.h */
#ifndef SIMDJSON_HASWELL_H
#define SIMDJSON_HASWELL_H
simdjson.cpp view on Meta::CPAN
private:
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
json_scanner scanner{};
utf8_checker checker{};
bit_indexer indexer;
uint64_t prev_structurals = 0;
uint64_t unescaped_chars_error = 0;
};
// Constructs an indexer whose bit_indexer member is initialized from structural_indexes.
// All other members (scanner, checker, prev_structurals, unescaped_chars_error) keep
// their in-class default initializers.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes) : indexer{structural_indexes} {}
// Skip the last character if it is partial: returns the length of buf once a
// multi-byte UTF-8 character that starts within the last three bytes but is cut
// off by the end of the buffer has been removed, or len unchanged otherwise.
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  if (simdjson_unlikely(len < 3)) {
    // Short input: only inspect the bytes that actually exist.
    if (len == 0) { return len; }
    if (buf[len-1] >= 0xc0) { return len-1; } // lead byte of a 2-, 3- or 4-byte character with only 1 byte present
    if (len == 2 && buf[len-2] >= 0xe0) { return len-2; } // lead byte of a 3- or 4-byte character with only 2 bytes present
    return len;
  }
  // At least three bytes are available, so all three trailing positions can be read.
  const uint8_t last_byte = buf[len-1];
  const uint8_t second_to_last_byte = buf[len-2];
  const uint8_t third_to_last_byte = buf[len-3];
  if (last_byte >= 0xc0) { return len-1; } // 2-, 3- and 4-byte characters with only 1 byte present
  if (second_to_last_byte >= 0xe0) { return len-2; } // 3- and 4-byte characters with only 2 bytes present
  if (third_to_last_byte >= 0xf0) { return len-3; } // 4-byte characters with only 3 bytes present
  return len;
}
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans has some dependency on the first ones finishing it, but
// they can make a lot of progress before they need that information.
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
/**
 * Runs stage 1 over buf: feeds it to a json_structural_indexer in blocks of
 * STEP_SIZE bytes and lets finish() complete the structural index held by parser.
 *
 * @param buf     The input bytes.
 * @param len     The number of input bytes; in the streaming modes it is first reduced by trim_partial_utf8().
 * @param parser  The parser whose structural_indexes buffer is filled.
 * @param partial The stage 1 mode.
 * @return CAPACITY if len exceeds parser.capacity(), EMPTY if len is zero, UTF8_ERROR if
 *         trimming leaves nothing, UNEXPECTED_ERROR if there is no final block, otherwise
 *         whatever finish() returns.
 */
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
  if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
  // Everything below may assume len > 0.
  if (len == 0) { return EMPTY; }
  if (is_streaming(partial)) {
    len = trim_partial_utf8(buf, len);
    // A window that is empty after dropping the partial UTF-8 bytes
    // most likely means the input has a UTF-8 formatting error.
    if (len == 0) { return UTF8_ERROR; }
  }

  buf_block_reader<STEP_SIZE> reader(buf, len);
  json_structural_indexer structural_indexer(parser.structural_indexes.get());

  // Process every full block; the last block is handled separately below.
  for (; reader.has_full_block(); ) {
    structural_indexer.step<STEP_SIZE>(reader.full_block(), reader);
  }

  // The last block is always present unless the input is empty, which was ruled out above.
  uint8_t block[STEP_SIZE];
  const auto remainder_length = reader.get_remainder(block);
  if (simdjson_unlikely(remainder_length == 0)) { return UNEXPECTED_ERROR; }
  structural_indexer.step<STEP_SIZE>(block, reader);

  return structural_indexer.finish(parser, reader.block_index(), len, partial);
}
// Handles one 128-byte block as two consecutive 64-byte halves: both halves are
// run through the scanner first, then through next(), and finally the reader is advanced.
template<>
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
  simd::simd8x64<uint8_t> lower_half(block);
  simd::simd8x64<uint8_t> upper_half(block + 64);
  json_block lower_scan = scanner.next(lower_half);
  json_block upper_scan = scanner.next(upper_half);
  // The upper half starts 64 bytes after the block index reported by the reader.
  this->next(lower_half, lower_scan, reader.block_index());
  this->next(upper_half, upper_scan, reader.block_index() + 64);
  reader.advance();
}
// Handles one 64-byte block: runs it through the scanner, then through next(),
// and finally advances the reader.
template<>
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> input(block);
  json_block scanned = scanner.next(input);
  this->next(input, scanned, reader.block_index());
  reader.advance();
}
// Consumes one scanned 64-byte input: optionally feeds it to the UTF-8 checker, writes out
// the structurals remembered from the previous call, remembers this block's structurals for
// the next call, and accumulates the unescaped-character error mask.
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Mask of the bytes that are <= 0x1F.
  uint64_t low_bytes = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Output *last* iteration's structurals to the parser, hence the offset of 64.
  indexer.write(uint32_t(idx-64), prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error = unescaped_chars_error | block.non_quote_inside_string(low_bytes);
}
/**
* Completes stage 1 once the last block has been passed through step().
*
* Writes out the structurals of the final iteration, checks the scanner result and the
* accumulated unescaped-character mask, stores the number of structural indexes in parser,
* appends three padding entries (len, len, 0) after the last index, applies the
* streaming_partial / streaming_final truncation rules, and finally returns the result of
* the UTF-8 checker.
*
* @param parser The parser whose structural_indexes, n_structural_indexes and
* next_structural_index are written.
* @param idx The block index handed to indexer.write() (offset by 64) for the final write.
* @param len The input length; stored in the padding entries of structural_indexes.
* @param partial The stage 1 mode; selects the regular, streaming_partial or streaming_final handling.
* @return The scanner error, UNESCAPED_CHARS, EMPTY, UNEXPECTED_ERROR or CAPACITY on the
* corresponding early exits, otherwise checker.errors().
*/
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
// Number of indexes written so far: distance between the indexer's tail and the start of the buffer.
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
* The On-Demand API requires special padding.
*
* This is related to https://github.com/simdjson/simdjson/issues/906
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
* structural character, it quickly stops.
* Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
* We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
* continues, then it must be [,] or }.
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
// The last structural index must not point past the end of the input.
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
// As above, drop the structural of an unclosed string.
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
// otherwise, it will copy some prior index.
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
// This next line is critical, do not change it unless you understand what you are
// doing.
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
// We tolerate an unclosed string at the very end of the stream. Indeed, users
// often load their data in bulk without being careful and they want us to ignore
// the trailing garbage.
return EMPTY;
}
}
// The structural checks passed; the final result is whatever the UTF-8 checker reports.
checker.check_eof();
return checker.errors();
}
} // namespace stage1
} // unnamed namespace
} // namespace haswell
} // namespace simdjson
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
/* end file generic/stage1/json_structural_indexer.h for haswell */
/* including generic/stage1/utf8_validator.h for haswell: #include <generic/stage1/utf8_validator.h> */
/* begin file generic/stage1/utf8_validator.h for haswell */
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
namespace simdjson {
namespace haswell {
namespace {
namespace stage1 {
/**
* Validates that the string is actual UTF-8.
*/
template<class checker>
bool generic_validate_utf8(const uint8_t * input, size_t length) {
checker c{};
buf_block_reader<64> reader(input, length);
simdjson.cpp view on Meta::CPAN
);
// We compute whitespace and op separately. If later code only uses one or the
// other, given the fact that all functions are aggressively inlined, we can
// hope that useless computations will be omitted. This is namely case when
// minifying (we only need whitespace).
const uint64_t whitespace = in.eq({
_mm256_shuffle_epi8(whitespace_table, in.chunks[0]),
_mm256_shuffle_epi8(whitespace_table, in.chunks[1])
});
// Turn [ and ] into { and }
const simd8x64<uint8_t> curlified{
in.chunks[0] | 0x20,
in.chunks[1] | 0x20
};
const uint64_t op = curlified.eq({
_mm256_shuffle_epi8(op_table, in.chunks[0]),
_mm256_shuffle_epi8(op_table, in.chunks[1])
});
return { whitespace, op };
}
// Reports whether the whole input block is ASCII: the chunks are first OR-reduced
// into a single vector and is_ascii() is then evaluated on that reduction.
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  auto combined = input.reduce_or();
  return combined.is_ascii();
}
// Builds a per-byte mask of the positions that must hold a UTF-8 continuation byte.
//   prev1 / prev2 / prev3: the input shifted back by one, two and three bytes respectively.
// A position must be a continuation when the byte 1 back starts a 2+ byte sequence, the
// byte 2 back starts a 3+ byte sequence, or the byte 3 back starts a 4-byte sequence.
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Saturating subtraction leaves a non-zero value only for bytes at or above the lead-byte threshold.
  simd8<uint8_t> after_2byte_lead = prev1.saturating_sub(0xc0u-1); // non-zero only for 11______
  simd8<uint8_t> after_3byte_lead = prev2.saturating_sub(0xe0u-1); // non-zero only for 111_____
  simd8<uint8_t> after_4byte_lead = prev3.saturating_sub(0xf0u-1); // non-zero only for 1111____
  simd8<uint8_t> any_lead = after_2byte_lead | after_3byte_lead | after_4byte_lead;
  // The caller wants a bool vector (all 1's). Every difference is at most 64, so a signed
  // greater-than-zero comparison is safe.
  return simd8<int8_t>(any_lead) > int8_t(0);
}
// Flags the positions that must be the 2nd or 3rd continuation byte of a 3- or 4-byte
// sequence. The flag is carried in the high bit (value >= 0x80) of each returned byte.
//   prev2 / prev3: the input shifted back by two and three bytes respectively.
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // After the saturating subtraction only lead bytes at or above the threshold keep the high bit set.
  simd8<uint8_t> after_3byte_lead = prev2.saturating_sub(0xe0u-0x80); // >= 0x80 only for 111_____
  simd8<uint8_t> after_4byte_lead = prev3.saturating_sub(0xf0u-0x80); // >= 0x80 only for 1111____
  return after_3byte_lead | after_4byte_lead;
}
} // unnamed namespace
} // namespace haswell
} // namespace simdjson
//
// Stage 2
//
//
// Implementation-specific overrides
//
namespace simdjson {
namespace haswell {

// Minifies `buf` into `dst` by delegating to the haswell stage-1 minifier (instantiated with 128).
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}

// Stage 1: records the input pointer and length on the parser, then runs the structural
// indexer (instantiated with 128) in the requested stage1 mode.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  buf = _buf;
  len = _len;
  return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}

// UTF-8 validation entry point; forwards to the haswell generic validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return haswell::stage1::generic_validate_utf8(buf,len);
}

// Stage 2 for a regular parse: tape builder with its boolean template flag set to false.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  return stage2::tape_builder::parse_document<false>(*this, _doc);
}

// Stage 2 for the "next document" path: tape builder with its boolean template flag set to true.
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  return stage2::tape_builder::parse_document<true>(*this, _doc);
}

// Forwards to the haswell string parser; memory sanitizer instrumentation is disabled here.
SIMDJSON_NO_SANITIZE_MEMORY
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
  return haswell::stringparsing::parse_string(src, dst, replacement_char);
}

// Forwards to the haswell "wobbly" string parser.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
  return haswell::stringparsing::parse_wobbly_string(src, dst);
}

// Full parse: stage 1 in regular mode, then stage 2 unless stage 1 reported an error.
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  auto stage1_error = stage1(_buf, _len, stage1_mode::regular);
  if (stage1_error) { return stage1_error; }
  return stage2(_doc);
}

} // namespace haswell
} // namespace simdjson
/* including simdjson/haswell/end.h: #include <simdjson/haswell/end.h> */
/* begin file simdjson/haswell/end.h */
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include "simdjson/haswell/base.h" */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
#if !SIMDJSON_CAN_ALWAYS_RUN_HASWELL
SIMDJSON_UNTARGET_REGION
#endif
/* undefining SIMDJSON_IMPLEMENTATION from "haswell" */
#undef SIMDJSON_IMPLEMENTATION
/* end file simdjson/haswell/end.h */
#endif // SIMDJSON_SRC_HASWELL_CPP
/* end file haswell.cpp */
#endif
#if SIMDJSON_IMPLEMENTATION_ICELAKE
/* including icelake.cpp: #include <icelake.cpp> */
/* begin file icelake.cpp */
#ifndef SIMDJSON_SRC_ICELAKE_CPP
#define SIMDJSON_SRC_ICELAKE_CPP
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include <base.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
/* including simdjson/icelake.h: #include <simdjson/icelake.h> */
simdjson.cpp view on Meta::CPAN
private:
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
json_scanner scanner{};
utf8_checker checker{};
bit_indexer indexer;
uint64_t prev_structurals = 0;
uint64_t unescaped_chars_error = 0;
};
// Initializes the bit indexer member from the given structural index buffer pointer.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer{structural_indexes} {}
// Returns the length of `buf` once a trailing, incomplete UTF-8 character (if any) has been
// dropped. Only the last three bytes are inspected, and never more bytes than `len` provides.
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  // Last byte is a lead byte: a 2-, 3- or 4-byte character with only 1 byte present.
  if (len >= 1 && buf[len-1] >= 0xc0) { return len-1; }
  // Second-to-last byte leads a 3- or 4-byte character with only 2 bytes present.
  if (len >= 2 && buf[len-2] >= 0xe0) { return len-2; }
  // Third-to-last byte leads a 4-byte character with only 3 bytes present.
  if (len >= 3 && buf[len-3] >= 0xf0) { return len-3; }
  return len;
}
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans has some dependency on the first ones finishing it, but
// they can make a lot of progress before they need that information.
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
// Stage-1 driver: scans `buf` in STEP_SIZE-byte blocks, collecting structural indexes into
// `parser`, and returns the result of finish() (or an early error code).
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
  // Reject oversized and empty inputs up front so that the rest may assume len > 0.
  if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
  if (len == 0) { return EMPTY; }

  if (is_streaming(partial)) {
    // A streaming window may stop in the middle of a UTF-8 character: drop the partial bytes.
    len = trim_partial_utf8(buf, len);
    // An empty window after trimming most likely means a UTF-8 formatting error.
    if (len == 0) { return UTF8_ERROR; }
  }

  buf_block_reader<STEP_SIZE> block_reader(buf, len);
  json_structural_indexer structural_indexer(parser.structural_indexes.get());

  // Process every block except the last one.
  while (block_reader.has_full_block()) {
    structural_indexer.step<STEP_SIZE>(block_reader.full_block(), block_reader);
  }

  // The last block goes through a local buffer handed to get_remainder(); it is always
  // expected to be there, so a zero-length remainder is reported as UNEXPECTED_ERROR.
  uint8_t last_block[STEP_SIZE];
  if (simdjson_unlikely(block_reader.get_remainder(last_block) == 0)) { return UNEXPECTED_ERROR; }
  structural_indexer.step<STEP_SIZE>(last_block, block_reader);

  return structural_indexer.finish(parser, block_reader.block_index(), len, partial);
}
// 128-byte step: the block is handled as two 64-byte halves. Both halves go through the
// scanner before either is passed to next(), then the reader is advanced.
template<>
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
  simd::simd8x64<uint8_t> lower_half(block);
  simd::simd8x64<uint8_t> upper_half(block+64);
  json_block lower_scan = scanner.next(lower_half);
  json_block upper_scan = scanner.next(upper_half);
  // The second half starts 64 bytes after the reader's current block index.
  this->next(lower_half, lower_scan, reader.block_index());
  this->next(upper_half, upper_scan, reader.block_index()+64);
  reader.advance();
}
// 64-byte step: scans a single 64-byte block, passes it to next() at the reader's current
// block index, then advances the reader.
template<>
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> chunk(block);
  json_block scanned = scanner.next(chunk);
  this->next(chunk, scanned, reader.block_index());
  reader.advance();
}
// Per-64-byte bookkeeping: feeds the UTF-8 checker (when enabled), writes out the structurals
// saved from the previous call, saves this block's structurals for the next call, and
// accumulates the unescaped-character error mask.
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Mask of the input bytes that are <= 0x1F.
  uint64_t control_chars = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Output is one iteration late: the structurals written here belong to the previous
  // 64 bytes, hence the idx-64 base.
  indexer.write(uint32_t(idx-64), prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error |= block.non_quote_inside_string(control_chars);
}
// Completes stage 1 once the last block has been scanned.
//   parser:  receives n_structural_indexes, the padded structural_indexes array and
//            next_structural_index (reset to 0).
//   idx:     block index reported by the reader; the pending structurals are written at idx-64.
//   len:     input length as seen by index() (after any streaming trim); stored as the sentinel
//            value past the last structural index.
//   partial: stage1 mode; the streaming modes tolerate UNCLOSED_STRING and truncate the index
//            list to the end of the last complete document.
// Returns the scanner error, UNESCAPED_CHARS, EMPTY, UNEXPECTED_ERROR or CAPACITY when one of
// the checks below fails; otherwise returns checker.errors().
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
// Number of structural indexes written so far: distance from the start of the array to the indexer's tail.
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
* The On-Demand API requires special padding.
*
* This is related to https://github.com/simdjson/simdjson/issues/906
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
* structural character, it quickly stops.
* Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
* We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
* continues, then it must be [,] or }.
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
// Sanity check: the last structural index must not point past the end of the input.
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
// n_structural_indexes is at least 1 here (the zero case returned EMPTY above), so the decrement cannot wrap.
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
// otherwise, it will copy some prior index.
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
// This next line is critical, do not change it unless you understand what you are
// doing.
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
// We tolerate an unclosed string at the very end of the stream. Indeed, users
// often load their data in bulk without being careful and they want us to ignore
// the trailing garbage.
return EMPTY;
}
}
// Let the UTF-8 checker account for the end of input, then report whatever it accumulated.
checker.check_eof();
return checker.errors();
}
} // namespace stage1
} // unnamed namespace
} // namespace icelake
} // namespace simdjson
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
/* end file generic/stage1/json_structural_indexer.h for icelake */
/* including generic/stage1/utf8_validator.h for icelake: #include <generic/stage1/utf8_validator.h> */
/* begin file generic/stage1/utf8_validator.h for icelake */
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
namespace simdjson {
namespace icelake {
namespace {
namespace stage1 {
/**
* Validates that the string is actual UTF-8.
*/
template<class checker>
bool generic_validate_utf8(const uint8_t * input, size_t length) {
checker c{};
buf_block_reader<64> reader(input, length);
simdjson.cpp view on Meta::CPAN
} // namespace simdjson
/**
* We provide a custom version of bit_indexer::write using
* naked intrinsics.
* TODO: make this code more elegant.
*/
// Under GCC 12, the intrinsic _mm512_extracti32x4_epi32 may generate 'maybe uninitialized'.
// as a workaround, we disable warnings within the following function.
SIMDJSON_PUSH_DISABLE_ALL_WARNINGS
namespace simdjson { namespace icelake { namespace { namespace stage1 {
// Appends to the array at this->tail the position of every set bit of `bits`, each offset
// by `idx`, then advances this->tail by `count`.
//   idx:  value added to every bit position before it is stored.
//   bits: 64-bit mask; bit i set means position idx + i is emitted.
simdjson_inline void bit_indexer::write(uint32_t idx, uint64_t bits) {
// In some instances, the next branch is expensive because it is mispredicted.
// Unfortunately, in other cases,
// it helps tremendously.
if (bits == 0) { return; }
// The constant vector holds the byte values 0x00..0x3f in lane order. Compressing it under the
// mask `bits` packs the positions of the set bits contiguously into the low bytes of `indexes`
// (the remaining bytes are zeroed).
const __m512i indexes = _mm512_maskz_compress_epi8(bits, _mm512_set_epi32(
0x3f3e3d3c, 0x3b3a3938, 0x37363534, 0x33323130,
0x2f2e2d2c, 0x2b2a2928, 0x27262524, 0x23222120,
0x1f1e1d1c, 0x1b1a1918, 0x17161514, 0x13121110,
0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100
));
const __m512i start_index = _mm512_set1_epi32(idx);
// presumably the population count of `bits` - confirm against count_ones
const auto count = count_ones(bits);
// Widen the first 16 packed positions from 8 to 32 bits, add idx and store them.
// This store always writes a full 512-bit vector, even when fewer than 16 bits are set;
// tail only advances by `count` at the end.
// NOTE(review): this implies the destination needs room for a full vector store past the
// last valid entry - confirm against the buffer allocation.
__m512i t0 = _mm512_cvtepu8_epi32(_mm512_castsi512_si128(indexes));
_mm512_storeu_si512(this->tail, _mm512_add_epi32(t0, start_index));
// The next groups of 16 positions are only processed when enough bits are set.
if(count > 16) {
const __m512i t1 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(indexes, 1));
_mm512_storeu_si512(this->tail + 16, _mm512_add_epi32(t1, start_index));
if(count > 32) {
const __m512i t2 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(indexes, 2));
_mm512_storeu_si512(this->tail + 32, _mm512_add_epi32(t2, start_index));
if(count > 48) {
const __m512i t3 = _mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(indexes, 3));
_mm512_storeu_si512(this->tail + 48, _mm512_add_epi32(t3, start_index));
}
}
}
this->tail += count;
}
}}}}
SIMDJSON_POP_DISABLE_WARNINGS
//
// Stage 2
//
//
// Implementation-specific overrides
//
namespace simdjson {
namespace icelake {

// Minifies `buf` into `dst` by delegating to the icelake stage-1 minifier (instantiated with 128).
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
  return icelake::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}

// Stage 1: records the input pointer and length on the parser, then runs the structural
// indexer (instantiated with 128) in the requested stage1 mode.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  buf = _buf;
  len = _len;
  return icelake::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}

// UTF-8 validation entry point; forwards to the icelake generic validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  return icelake::stage1::generic_validate_utf8(buf,len);
}

// Stage 2 for a regular parse: tape builder with its boolean template flag set to false.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  return stage2::tape_builder::parse_document<false>(*this, _doc);
}

// Stage 2 for the "next document" path: tape builder with its boolean template flag set to true.
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  return stage2::tape_builder::parse_document<true>(*this, _doc);
}

// Forwards to the icelake string parser; memory sanitizer instrumentation is disabled here.
SIMDJSON_NO_SANITIZE_MEMORY
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
  return icelake::stringparsing::parse_string(src, dst, replacement_char);
}

// Forwards to the icelake "wobbly" string parser.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
  return icelake::stringparsing::parse_wobbly_string(src, dst);
}

// Full parse: stage 1 in regular mode, then stage 2 unless stage 1 reported an error.
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  auto stage1_error = stage1(_buf, _len, stage1_mode::regular);
  if (stage1_error) { return stage1_error; }
  return stage2(_doc);
}

} // namespace icelake
} // namespace simdjson
/* including simdjson/icelake/end.h: #include <simdjson/icelake/end.h> */
/* begin file simdjson/icelake/end.h */
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include "simdjson/icelake/base.h" */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
#if !SIMDJSON_CAN_ALWAYS_RUN_ICELAKE
SIMDJSON_UNTARGET_REGION
#endif
/* undefining SIMDJSON_IMPLEMENTATION from "icelake" */
#undef SIMDJSON_IMPLEMENTATION
/* end file simdjson/icelake/end.h */
#endif // SIMDJSON_SRC_ICELAKE_CPP
/* end file icelake.cpp */
#endif
#if SIMDJSON_IMPLEMENTATION_PPC64
/* including ppc64.cpp: #include <ppc64.cpp> */
/* begin file ppc64.cpp */
#ifndef SIMDJSON_SRC_PPC64_CPP
#define SIMDJSON_SRC_PPC64_CPP
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include <base.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
/* including simdjson/ppc64.h: #include <simdjson/ppc64.h> */
simdjson.cpp view on Meta::CPAN
private:
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
json_scanner scanner{};
utf8_checker checker{};
bit_indexer indexer;
uint64_t prev_structurals = 0;
uint64_t unescaped_chars_error = 0;
};
// Initializes the bit indexer member from the given structural index buffer pointer.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer{structural_indexes} {}
// Returns the length of `buf` once a trailing, incomplete UTF-8 character (if any) has been
// dropped. Only the last three bytes are inspected, and never more bytes than `len` provides.
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  // Last byte is a lead byte: a 2-, 3- or 4-byte character with only 1 byte present.
  if (len >= 1 && buf[len-1] >= 0xc0) { return len-1; }
  // Second-to-last byte leads a 3- or 4-byte character with only 2 bytes present.
  if (len >= 2 && buf[len-2] >= 0xe0) { return len-2; }
  // Third-to-last byte leads a 4-byte character with only 3 bytes present.
  if (len >= 3 && buf[len-3] >= 0xf0) { return len-3; }
  return len;
}
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans has some dependency on the first ones finishing it, but
// they can make a lot of progress before they need that information.
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
// Stage-1 driver: scans `buf` in STEP_SIZE-byte blocks, collecting structural indexes into
// `parser`, and returns the result of finish() (or an early error code).
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
  // Reject oversized and empty inputs up front so that the rest may assume len > 0.
  if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
  if (len == 0) { return EMPTY; }

  if (is_streaming(partial)) {
    // A streaming window may stop in the middle of a UTF-8 character: drop the partial bytes.
    len = trim_partial_utf8(buf, len);
    // An empty window after trimming most likely means a UTF-8 formatting error.
    if (len == 0) { return UTF8_ERROR; }
  }

  buf_block_reader<STEP_SIZE> block_reader(buf, len);
  json_structural_indexer structural_indexer(parser.structural_indexes.get());

  // Process every block except the last one.
  while (block_reader.has_full_block()) {
    structural_indexer.step<STEP_SIZE>(block_reader.full_block(), block_reader);
  }

  // The last block goes through a local buffer handed to get_remainder(); it is always
  // expected to be there, so a zero-length remainder is reported as UNEXPECTED_ERROR.
  uint8_t last_block[STEP_SIZE];
  if (simdjson_unlikely(block_reader.get_remainder(last_block) == 0)) { return UNEXPECTED_ERROR; }
  structural_indexer.step<STEP_SIZE>(last_block, block_reader);

  return structural_indexer.finish(parser, block_reader.block_index(), len, partial);
}
// 128-byte step: the block is handled as two 64-byte halves. Both halves go through the
// scanner before either is passed to next(), then the reader is advanced.
template<>
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
  simd::simd8x64<uint8_t> lower_half(block);
  simd::simd8x64<uint8_t> upper_half(block+64);
  json_block lower_scan = scanner.next(lower_half);
  json_block upper_scan = scanner.next(upper_half);
  // The second half starts 64 bytes after the reader's current block index.
  this->next(lower_half, lower_scan, reader.block_index());
  this->next(upper_half, upper_scan, reader.block_index()+64);
  reader.advance();
}
// 64-byte step: scans a single 64-byte block, passes it to next() at the reader's current
// block index, then advances the reader.
template<>
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> chunk(block);
  json_block scanned = scanner.next(chunk);
  this->next(chunk, scanned, reader.block_index());
  reader.advance();
}
// Per-64-byte bookkeeping: feeds the UTF-8 checker (when enabled), writes out the structurals
// saved from the previous call, saves this block's structurals for the next call, and
// accumulates the unescaped-character error mask.
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Mask of the input bytes that are <= 0x1F.
  uint64_t control_chars = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Output is one iteration late: the structurals written here belong to the previous
  // 64 bytes, hence the idx-64 base.
  indexer.write(uint32_t(idx-64), prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error |= block.non_quote_inside_string(control_chars);
}
// Completes stage 1 once the last block has been scanned.
//   parser:  receives n_structural_indexes, the padded structural_indexes array and
//            next_structural_index (reset to 0).
//   idx:     block index reported by the reader; the pending structurals are written at idx-64.
//   len:     input length as seen by index() (after any streaming trim); stored as the sentinel
//            value past the last structural index.
//   partial: stage1 mode; the streaming modes tolerate UNCLOSED_STRING and truncate the index
//            list to the end of the last complete document.
// Returns the scanner error, UNESCAPED_CHARS, EMPTY, UNEXPECTED_ERROR or CAPACITY when one of
// the checks below fails; otherwise returns checker.errors().
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
// Number of structural indexes written so far: distance from the start of the array to the indexer's tail.
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
* The On-Demand API requires special padding.
*
* This is related to https://github.com/simdjson/simdjson/issues/906
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
* structural character, it quickly stops.
* Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
* We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
* continues, then it must be [,] or }.
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
// Sanity check: the last structural index must not point past the end of the input.
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
// n_structural_indexes is at least 1 here (the zero case returned EMPTY above), so the decrement cannot wrap.
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
// otherwise, it will copy some prior index.
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
// This next line is critical, do not change it unless you understand what you are
// doing.
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
// We tolerate an unclosed string at the very end of the stream. Indeed, users
// often load their data in bulk without being careful and they want us to ignore
// the trailing garbage.
return EMPTY;
}
}
// Let the UTF-8 checker account for the end of input, then report whatever it accumulated.
checker.check_eof();
return checker.errors();
}
} // namespace stage1
} // unnamed namespace
} // namespace ppc64
} // namespace simdjson
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
/* end file generic/stage1/json_structural_indexer.h for ppc64 */
/* including generic/stage1/utf8_validator.h for ppc64: #include <generic/stage1/utf8_validator.h> */
/* begin file generic/stage1/utf8_validator.h for ppc64 */
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
namespace simdjson {
namespace ppc64 {
namespace {
namespace stage1 {
/**
* Validates that the string is actual UTF-8.
*/
template<class checker>
bool generic_validate_utf8(const uint8_t * input, size_t length) {
checker c{};
buf_block_reader<64> reader(input, length);
simdjson.cpp view on Meta::CPAN
(in.chunks[0] & 0xf).lookup_16(table1) & (in.chunks[0].shr<4>()).lookup_16(table2),
(in.chunks[1] & 0xf).lookup_16(table1) & (in.chunks[1].shr<4>()).lookup_16(table2),
(in.chunks[2] & 0xf).lookup_16(table1) & (in.chunks[2].shr<4>()).lookup_16(table2),
(in.chunks[3] & 0xf).lookup_16(table1) & (in.chunks[3].shr<4>()).lookup_16(table2)
);
uint64_t op = simd8x64<bool>(
v.chunks[0].any_bits_set(0x7),
v.chunks[1].any_bits_set(0x7),
v.chunks[2].any_bits_set(0x7),
v.chunks[3].any_bits_set(0x7)
).to_bitmask();
uint64_t whitespace = simd8x64<bool>(
v.chunks[0].any_bits_set(0x18),
v.chunks[1].any_bits_set(0x18),
v.chunks[2].any_bits_set(0x18),
v.chunks[3].any_bits_set(0x18)
).to_bitmask();
return { whitespace, op };
}
/**
 * Tells whether the whole input block is ASCII, i.e. no byte is 0x80 or above.
 */
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  // Collapse the block into a single vector with reduce_or().
  auto combined = input.reduce_or();
  // Subtract 0x7f with saturation: only bytes above 0x7f keep a non-zero residue.
  // Careful: 0x80 is not ASCII, and it does leave a residue (0x01).
  auto above_ascii = combined.saturating_sub(0x7fu);
  return above_ascii.bits_not_set_anywhere();
}
/**
 * Builds a boolean mask of the lanes that must hold a UTF-8 continuation byte.
 * Judging by their names, prev1/prev2/prev3 carry the bytes located one, two and
 * three positions before each lane.
 */
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // A saturating subtraction of (threshold - 1) leaves a non-zero lane only where
  // the byte is at or above the threshold.
  simd8<uint8_t> after_lead2 = prev1.saturating_sub(0xbfu); // 0xc0 - 1: only 11______ stays > 0
  simd8<uint8_t> after_lead3 = prev2.saturating_sub(0xdfu); // 0xe0 - 1: only 111_____ stays > 0
  simd8<uint8_t> after_lead4 = prev3.saturating_sub(0xefu); // 0xf0 - 1: only 1111____ stays > 0
  simd8<uint8_t> any_lead = after_lead2 | after_lead3 | after_lead4;
  // The caller wants a bool vector (all 1's per lane). Every residue is at most 64,
  // so it is safe to compare the lanes as signed values.
  return simd8<int8_t>(any_lead) > int8_t(0);
}
/**
 * Marks (with the high bit of each lane) the positions that follow a 3-byte lead
 * by two bytes or a 4-byte lead by three bytes, going by the parameter names.
 */
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Subtracting (threshold - 0x80) with saturation keeps the lane at or above 0x80
  // exactly when the byte is at or above the threshold.
  simd8<uint8_t> third_of_3plus = prev2.saturating_sub(0x60u); // 0xe0 - 0x80: only 111_____ stays >= 0x80
  simd8<uint8_t> fourth_of_4    = prev3.saturating_sub(0x70u); // 0xf0 - 0x80: only 1111____ stays >= 0x80
  return third_of_3plus | fourth_of_4;
}
} // unnamed namespace
} // namespace ppc64
} // namespace simdjson
//
// Stage 2
//
//
// Implementation-specific overrides
//
namespace simdjson {
namespace ppc64 {
// ppc64 override of implementation::minify.
// Forwards buf/len and the dst/dst_len output pair to stage1::json_minifier::minify<64>
// and returns its error_code unchanged.
// NOTE(review): the template argument 64 is presumably the step size in bytes - confirm
// against json_minifier.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
// ppc64 stage 1: records the input buffer and its length on the parser, then runs the
// structural indexer over it with a 64-byte step. `streaming` is handed to the indexer
// untouched; the indexer's error_code is returned as is.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  buf = _buf;
  len = _len;
  return ppc64::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
// ppc64 override of implementation::validate_utf8.
// Delegates to stage1::generic_validate_utf8 with the same buffer and length and
// returns its boolean verdict.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return ppc64::stage1::generic_validate_utf8(buf,len);
}
// Stage 2 for a single document: runs stage2::tape_builder::parse_document<false> on
// this parser, filling _doc, and returns its error_code.
// NOTE(review): the boolean template argument presumably selects streaming mode
// (false here) - confirm against tape_builder.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
// Stage 2 variant that instantiates stage2::tape_builder::parse_document<true>
// on this parser, filling _doc, and returns its error_code.
// NOTE(review): the boolean template argument presumably selects streaming mode
// (true here, i.e. "parse the next document") - confirm against tape_builder.
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<true>(*this, _doc);
}
// Forwards to ppc64::stringparsing::parse_string with the same src, dst and
// replacement_char arguments and returns the pointer it produces.
// NOTE(review): SIMDJSON_NO_SANITIZE_MEMORY presumably turns off MemorySanitizer
// instrumentation for this function - confirm against the macro definition.
SIMDJSON_NO_SANITIZE_MEMORY
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
return ppc64::stringparsing::parse_string(src, dst, replacement_char);
}
// Forwards to ppc64::stringparsing::parse_wobbly_string with the same src and dst
// arguments and returns the pointer it produces.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
return ppc64::stringparsing::parse_wobbly_string(src, dst);
}
// Full parse of one document: stage 1 in regular (non-streaming) mode, then stage 2.
// A stage 1 failure is returned immediately and stage 2 is not run.
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  const error_code stage1_status = stage1(_buf, _len, stage1_mode::regular);
  if (stage1_status) {
    return stage1_status;
  }
  return stage2(_doc);
}
} // namespace ppc64
} // namespace simdjson
/* including simdjson/ppc64/end.h: #include <simdjson/ppc64/end.h> */
/* begin file simdjson/ppc64/end.h */
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include "simdjson/ppc64/base.h" */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
/* undefining SIMDJSON_IMPLEMENTATION from "ppc64" */
#undef SIMDJSON_IMPLEMENTATION
/* end file simdjson/ppc64/end.h */
#endif // SIMDJSON_SRC_PPC64_CPP
/* end file ppc64.cpp */
#endif
#if SIMDJSON_IMPLEMENTATION_WESTMERE
/* including westmere.cpp: #include <westmere.cpp> */
/* begin file westmere.cpp */
#ifndef SIMDJSON_SRC_WESTMERE_CPP
#define SIMDJSON_SRC_WESTMERE_CPP
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include <base.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
/* including simdjson/westmere.h: #include <simdjson/westmere.h> */
/* begin file simdjson/westmere.h */
#ifndef SIMDJSON_WESTMERE_H
#define SIMDJSON_WESTMERE_H
simdjson.cpp view on Meta::CPAN
private:
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
json_scanner scanner{};
utf8_checker checker{};
bit_indexer indexer;
uint64_t prev_structurals = 0;
uint64_t unescaped_chars_error = 0;
};
// Hands the caller's structural index buffer to the bit indexer; every other member
// keeps its in-class default.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer{structural_indexes}
{}
// Returns the length of buf once a trailing, incomplete UTF-8 character (if any) has
// been dropped. Only the last three bytes are examined, and never more than len bytes.
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  // Last byte is a lead byte: a 2-, 3- or 4-byte character with only 1 byte present.
  if (len >= 1 && buf[len-1] >= 0xc0) { return len-1; }
  // Second-to-last byte leads a 3- or 4-byte character with only 2 bytes present.
  if (len >= 2 && buf[len-2] >= 0xe0) { return len-2; }
  // Third-to-last byte leads a 4-byte character with only 3 bytes present.
  if (len >= 3 && buf[len-3] >= 0xf0) { return len-3; }
  // Nothing is cut off at the end.
  return len;
}
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans have some dependency on the first one's finishing, but
// they can make a lot of progress before they need that information.
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
// Stage 1 driver: walks buf in STEP_SIZE-byte blocks and leaves the structural indexes
// on the parser.
//
// Returns CAPACITY when len exceeds parser.capacity(), EMPTY for a zero-length input,
// UTF8_ERROR when a streaming window is empty once its partial trailing character has
// been trimmed, UNEXPECTED_ERROR when the reader has no remainder block to offer, and
// otherwise whatever finish() reports.
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
  if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
  // Past this guard the code may rely on len > 0.
  if (len == 0) { return EMPTY; }
  if (is_streaming(partial)) {
    len = trim_partial_utf8(buf, len);
    // A window that is empty after trimming the partial UTF-8 bytes most likely
    // means the input has a UTF-8 formatting error.
    if (len == 0) { return UTF8_ERROR; }
  }

  buf_block_reader<STEP_SIZE> blocks(buf, len);
  json_structural_indexer structurals(parser.structural_indexes.get());

  // Every complete block is processed straight out of the input buffer.
  while (blocks.has_full_block()) {
    structurals.step<STEP_SIZE>(blocks.full_block(), blocks);
  }

  // The final block is copied into a local buffer by get_remainder(). It should
  // always exist: only an empty file has none, and that was rejected above.
  uint8_t last_block[STEP_SIZE];
  if (simdjson_unlikely(blocks.get_remainder(last_block) == 0)) { return UNEXPECTED_ERROR; }
  structurals.step<STEP_SIZE>(last_block, blocks);

  return structurals.finish(parser, blocks.block_index(), len, partial);
}
// 128-byte step: the block is handled as two 64-byte halves. Both halves go through
// the scanner first, then both are passed to next(), and finally the reader advances.
template<>
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
  simd::simd8x64<uint8_t> first_half(block);
  simd::simd8x64<uint8_t> second_half(block+64);
  json_block first_scan = scanner.next(first_half);
  json_block second_scan = scanner.next(second_half);
  // Both halves are reported relative to the reader's current block index.
  const auto base_index = reader.block_index();
  this->next(first_half, first_scan, base_index);
  this->next(second_half, second_scan, base_index+64);
  reader.advance();
}
// 64-byte step: scan the single block, pass it to next() together with the reader's
// current block index, then advance the reader.
template<>
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> chunk(block);
  json_block chunk_scan = scanner.next(chunk);
  const auto chunk_index = reader.block_index();
  this->next(chunk, chunk_scan, chunk_index);
  reader.advance();
}
// Per-64-byte bookkeeping. Output lags one iteration behind: the structurals stored by
// the previous call are written now (at idx-64), and this block's structurals are kept
// for the following call (or for finish()).
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Output *last* iteration's structurals to the parser.
  indexer.write(static_cast<uint32_t>(idx-64), prev_structurals);
  prev_structurals = block.structural_start();
  // Bytes at or below 0x1F that the block reports as non-quote characters inside a
  // string are accumulated; finish() turns a non-zero accumulator into UNESCAPED_CHARS.
  const uint64_t control_bytes = in.lteq(0x1F);
  unescaped_chars_error |= block.non_quote_inside_string(control_bytes);
}
// Completes stage 1 once every block has been stepped through.
//
// In order, this function:
//  - writes the structurals still pending from the last iteration and asks the scanner
//    for its final status (UNCLOSED_STRING is tolerated in the streaming modes only);
//  - returns UNESCAPED_CHARS if any block flagged an unescaped character;
//  - stores the number of structural indexes on the parser, appends three padding
//    entries (len, len, 0) and resets parser.next_structural_index;
//  - returns EMPTY when there is no structural index at all and UNEXPECTED_ERROR when
//    the last one lies beyond len;
//  - in streaming_partial / streaming_final mode, truncates the index to the last
//    complete document as reported by find_next_document_index();
//  - finally runs the UTF-8 checker's end-of-input check and returns its verdict.
//
// parser  - receives n_structural_indexes, next_structural_index and the padding entries
// idx     - block index reported by the reader after the last step
// len     - input length (already trimmed by the caller in streaming modes)
// partial - stage 1 mode (regular, streaming_partial or streaming_final)
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
// The count is the distance between the indexer's write cursor and the start of the buffer.
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
* The On-Demand API requires special padding.
*
* This is related to https://github.com/simdjson/simdjson/issues/906
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
* structural character, it quickly stops.
* Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
* We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
* continues, then it must be [,] or }.
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
// Sanity check: the last structural index must not point past the end of the input.
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
// The count is known to be non-zero here (checked above), so the decrement cannot wrap.
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
// otherwise, it will copy some prior index.
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
// This next line is critical, do not change it unless you understand what you are
// doing.
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
// We tolerate an unclosed string at the very end of the stream. Indeed, users
// often load their data in bulk without being careful and they want us to ignore
// the trailing garbage.
return EMPTY;
}
}
// Let the UTF-8 checker account for the end of the input, then report its verdict.
checker.check_eof();
return checker.errors();
}
} // namespace stage1
} // unnamed namespace
} // namespace westmere
} // namespace simdjson
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
/* end file generic/stage1/json_structural_indexer.h for westmere */
/* including generic/stage1/utf8_validator.h for westmere: #include <generic/stage1/utf8_validator.h> */
/* begin file generic/stage1/utf8_validator.h for westmere */
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
namespace simdjson {
namespace westmere {
namespace {
namespace stage1 {
/**
* Validates that the string is actual UTF-8.
*/
template<class checker>
bool generic_validate_utf8(const uint8_t * input, size_t length) {
checker c{};
buf_block_reader<64> reader(input, length);
simdjson.cpp view on Meta::CPAN
const uint64_t whitespace = in.eq({
_mm_shuffle_epi8(whitespace_table, in.chunks[0]),
_mm_shuffle_epi8(whitespace_table, in.chunks[1]),
_mm_shuffle_epi8(whitespace_table, in.chunks[2]),
_mm_shuffle_epi8(whitespace_table, in.chunks[3])
});
// Turn [ and ] into { and }
const simd8x64<uint8_t> curlified{
in.chunks[0] | 0x20,
in.chunks[1] | 0x20,
in.chunks[2] | 0x20,
in.chunks[3] | 0x20
};
const uint64_t op = curlified.eq({
_mm_shuffle_epi8(op_table, in.chunks[0]),
_mm_shuffle_epi8(op_table, in.chunks[1]),
_mm_shuffle_epi8(op_table, in.chunks[2]),
_mm_shuffle_epi8(op_table, in.chunks[3])
});
return { whitespace, op };
}
/**
 * Tells whether the whole input block is ASCII: the block is collapsed into a single
 * vector with reduce_or() and that vector's own is_ascii() gives the answer.
 */
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  auto combined = input.reduce_or();
  return combined.is_ascii();
}
/**
 * Builds a boolean mask of the lanes that must hold a UTF-8 continuation byte.
 * Judging by their names, prev1/prev2/prev3 carry the bytes located one, two and
 * three positions before each lane.
 */
simdjson_unused simdjson_inline simd8<bool> must_be_continuation(const simd8<uint8_t> prev1, const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // A saturating subtraction of (threshold - 1) leaves a non-zero lane only where
  // the byte is at or above the threshold.
  simd8<uint8_t> after_lead2 = prev1.saturating_sub(0xbfu); // 0xc0 - 1: only 11______ stays > 0
  simd8<uint8_t> after_lead3 = prev2.saturating_sub(0xdfu); // 0xe0 - 1: only 111_____ stays > 0
  simd8<uint8_t> after_lead4 = prev3.saturating_sub(0xefu); // 0xf0 - 1: only 1111____ stays > 0
  simd8<uint8_t> any_lead = after_lead2 | after_lead3 | after_lead4;
  // The caller wants a bool vector (all 1's per lane). Every residue is at most 64,
  // so it is safe to compare the lanes as signed values.
  return simd8<int8_t>(any_lead) > int8_t(0);
}
/**
 * Marks (with the high bit of each lane) the positions that follow a 3-byte lead
 * by two bytes or a 4-byte lead by three bytes, going by the parameter names.
 */
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  // Subtracting (threshold - 0x80) with saturation keeps the lane at or above 0x80
  // exactly when the byte is at or above the threshold.
  simd8<uint8_t> third_of_3plus = prev2.saturating_sub(0x60u); // 0xe0 - 0x80: only 111_____ stays >= 0x80
  simd8<uint8_t> fourth_of_4    = prev3.saturating_sub(0x70u); // 0xf0 - 0x80: only 1111____ stays >= 0x80
  return third_of_3plus | fourth_of_4;
}
} // unnamed namespace
} // namespace westmere
} // namespace simdjson
//
// Stage 2
//
//
// Implementation-specific overrides
//
namespace simdjson {
namespace westmere {
// westmere override of implementation::minify.
// Forwards buf/len and the dst/dst_len output pair to stage1::json_minifier::minify<64>
// and returns its error_code unchanged.
// NOTE(review): the template argument 64 is presumably the step size in bytes - confirm
// against json_minifier.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
// westmere stage 1: records the input buffer and its length on the parser, then runs
// the structural indexer over it with a 64-byte step. `streaming` is handed to the
// indexer untouched; the indexer's error_code is returned as is.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  buf = _buf;
  len = _len;
  return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
// westmere override of implementation::validate_utf8.
// Delegates to stage1::generic_validate_utf8 with the same buffer and length and
// returns its boolean verdict.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return westmere::stage1::generic_validate_utf8(buf,len);
}
// Stage 2 for a single document: runs stage2::tape_builder::parse_document<false> on
// this parser, filling _doc, and returns its error_code.
// NOTE(review): the boolean template argument presumably selects streaming mode
// (false here) - confirm against tape_builder.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
// Stage 2 variant that instantiates stage2::tape_builder::parse_document<true>
// on this parser, filling _doc, and returns its error_code.
// NOTE(review): the boolean template argument presumably selects streaming mode
// (true here, i.e. "parse the next document") - confirm against tape_builder.
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<true>(*this, _doc);
}
// Forwards to westmere::stringparsing::parse_string with the same src, dst and
// replacement_char arguments and returns the pointer it produces.
// NOTE(review): SIMDJSON_NO_SANITIZE_MEMORY presumably turns off MemorySanitizer
// instrumentation for this function - confirm against the macro definition.
SIMDJSON_NO_SANITIZE_MEMORY
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(const uint8_t *src, uint8_t *dst, bool replacement_char) const noexcept {
return westmere::stringparsing::parse_string(src, dst, replacement_char);
}
// Forwards to westmere::stringparsing::parse_wobbly_string with the same src and dst
// arguments and returns the pointer it produces.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(const uint8_t *src, uint8_t *dst) const noexcept {
return westmere::stringparsing::parse_wobbly_string(src, dst);
}
// Full parse of one document: stage 1 in regular (non-streaming) mode, then stage 2.
// A stage 1 failure is returned immediately and stage 2 is not run.
simdjson_warn_unused error_code dom_parser_implementation::parse(const uint8_t *_buf, size_t _len, dom::document &_doc) noexcept {
  const error_code stage1_status = stage1(_buf, _len, stage1_mode::regular);
  if (stage1_status) {
    return stage1_status;
  }
  return stage2(_doc);
}
} // namespace westmere
} // namespace simdjson
/* including simdjson/westmere/end.h: #include <simdjson/westmere/end.h> */
/* begin file simdjson/westmere/end.h */
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include "simdjson/westmere/base.h" */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
#if !SIMDJSON_CAN_ALWAYS_RUN_WESTMERE
SIMDJSON_UNTARGET_REGION
#endif
/* undefining SIMDJSON_IMPLEMENTATION from "westmere" */
#undef SIMDJSON_IMPLEMENTATION
/* end file simdjson/westmere/end.h */
#endif // SIMDJSON_SRC_WESTMERE_CPP
/* end file westmere.cpp */
#endif
#if SIMDJSON_IMPLEMENTATION_LSX
/* including lsx.cpp: #include <lsx.cpp> */
/* begin file lsx.cpp */
#ifndef SIMDJSON_SRC_LSX_CPP
#define SIMDJSON_SRC_LSX_CPP
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include <base.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
/* including simdjson/lsx.h: #include <simdjson/lsx.h> */
simdjson.cpp view on Meta::CPAN
private:
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
json_scanner scanner{};
utf8_checker checker{};
bit_indexer indexer;
uint64_t prev_structurals = 0;
uint64_t unescaped_chars_error = 0;
};
// Hands the caller's structural index buffer to the bit indexer; every other member
// keeps its in-class default.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer{structural_indexes}
{}
// Returns the length of buf once a trailing, incomplete UTF-8 character (if any) has
// been dropped. Only the last three bytes are examined, and never more than len bytes.
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  // Last byte is a lead byte: a 2-, 3- or 4-byte character with only 1 byte present.
  if (len >= 1 && buf[len-1] >= 0xc0) { return len-1; }
  // Second-to-last byte leads a 3- or 4-byte character with only 2 bytes present.
  if (len >= 2 && buf[len-2] >= 0xe0) { return len-2; }
  // Third-to-last byte leads a 4-byte character with only 3 bytes present.
  if (len >= 3 && buf[len-3] >= 0xf0) { return len-3; }
  // Nothing is cut off at the end.
  return len;
}
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans have some dependency on the first one's finishing, but
// they can make a lot of progress before they need that information.
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
// Stage 1 driver: walks buf in STEP_SIZE-byte blocks and leaves the structural indexes
// on the parser.
//
// Returns CAPACITY when len exceeds parser.capacity(), EMPTY for a zero-length input,
// UTF8_ERROR when a streaming window is empty once its partial trailing character has
// been trimmed, UNEXPECTED_ERROR when the reader has no remainder block to offer, and
// otherwise whatever finish() reports.
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
  if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
  // Past this guard the code may rely on len > 0.
  if (len == 0) { return EMPTY; }
  if (is_streaming(partial)) {
    len = trim_partial_utf8(buf, len);
    // A window that is empty after trimming the partial UTF-8 bytes most likely
    // means the input has a UTF-8 formatting error.
    if (len == 0) { return UTF8_ERROR; }
  }

  buf_block_reader<STEP_SIZE> blocks(buf, len);
  json_structural_indexer structurals(parser.structural_indexes.get());

  // Every complete block is processed straight out of the input buffer.
  while (blocks.has_full_block()) {
    structurals.step<STEP_SIZE>(blocks.full_block(), blocks);
  }

  // The final block is copied into a local buffer by get_remainder(). It should
  // always exist: only an empty file has none, and that was rejected above.
  uint8_t last_block[STEP_SIZE];
  if (simdjson_unlikely(blocks.get_remainder(last_block) == 0)) { return UNEXPECTED_ERROR; }
  structurals.step<STEP_SIZE>(last_block, blocks);

  return structurals.finish(parser, blocks.block_index(), len, partial);
}
// 128-byte step: the block is handled as two 64-byte halves. Both halves go through
// the scanner first, then both are passed to next(), and finally the reader advances.
template<>
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
  simd::simd8x64<uint8_t> first_half(block);
  simd::simd8x64<uint8_t> second_half(block+64);
  json_block first_scan = scanner.next(first_half);
  json_block second_scan = scanner.next(second_half);
  // Both halves are reported relative to the reader's current block index.
  const auto base_index = reader.block_index();
  this->next(first_half, first_scan, base_index);
  this->next(second_half, second_scan, base_index+64);
  reader.advance();
}
// 64-byte step: scan the single block, pass it to next() together with the reader's
// current block index, then advance the reader.
template<>
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> chunk(block);
  json_block chunk_scan = scanner.next(chunk);
  const auto chunk_index = reader.block_index();
  this->next(chunk, chunk_scan, chunk_index);
  reader.advance();
}
// Per-64-byte bookkeeping. Output lags one iteration behind: the structurals stored by
// the previous call are written now (at idx-64), and this block's structurals are kept
// for the following call (or for finish()).
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // Output *last* iteration's structurals to the parser.
  indexer.write(static_cast<uint32_t>(idx-64), prev_structurals);
  prev_structurals = block.structural_start();
  // Bytes at or below 0x1F that the block reports as non-quote characters inside a
  // string are accumulated; finish() turns a non-zero accumulator into UNESCAPED_CHARS.
  const uint64_t control_bytes = in.lteq(0x1F);
  unescaped_chars_error |= block.non_quote_inside_string(control_bytes);
}
// Completes stage 1 once every block has been stepped through.
//
// In order, this function:
//  - writes the structurals still pending from the last iteration and asks the scanner
//    for its final status (UNCLOSED_STRING is tolerated in the streaming modes only);
//  - returns UNESCAPED_CHARS if any block flagged an unescaped character;
//  - stores the number of structural indexes on the parser, appends three padding
//    entries (len, len, 0) and resets parser.next_structural_index;
//  - returns EMPTY when there is no structural index at all and UNEXPECTED_ERROR when
//    the last one lies beyond len;
//  - in streaming_partial / streaming_final mode, truncates the index to the last
//    complete document as reported by find_next_document_index();
//  - finally runs the UTF-8 checker's end-of-input check and returns its verdict.
//
// parser  - receives n_structural_indexes, next_structural_index and the padding entries
// idx     - block index reported by the reader after the last step
// len     - input length (already trimmed by the caller in streaming modes)
// partial - stage 1 mode (regular, streaming_partial or streaming_final)
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
// The count is the distance between the indexer's write cursor and the start of the buffer.
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
* The On-Demand API requires special padding.
*
* This is related to https://github.com/simdjson/simdjson/issues/906
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
* structural character, it quickly stops.
* Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
* We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
* continues, then it must be [,] or }.
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
// Sanity check: the last structural index must not point past the end of the input.
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
// The count is known to be non-zero here (checked above), so the decrement cannot wrap.
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
// otherwise, it will copy some prior index.
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
// This next line is critical, do not change it unless you understand what you are
// doing.
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
// We tolerate an unclosed string at the very end of the stream. Indeed, users
// often load their data in bulk without being careful and they want us to ignore
// the trailing garbage.
return EMPTY;
}
}
// Let the UTF-8 checker account for the end of the input, then report its verdict.
checker.check_eof();
return checker.errors();
}
} // namespace stage1
} // unnamed namespace
} // namespace lsx
} // namespace simdjson
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
/* end file generic/stage1/json_structural_indexer.h for lsx */
/* including generic/stage1/utf8_validator.h for lsx: #include <generic/stage1/utf8_validator.h> */
/* begin file generic/stage1/utf8_validator.h for lsx */
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
namespace simdjson {
namespace lsx {
namespace {
namespace stage1 {
/**
* Validates that the string is actual UTF-8.
*/
template<class checker>
bool generic_validate_utf8(const uint8_t * input, size_t length) {
checker c{};
buf_block_reader<64> reader(input, length);
simdjson.cpp view on Meta::CPAN
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
  // Table-lookup classification (approach borrowed from the haswell kernel).
  //
  // Whitespace is looked up with lookup_16; operators are looked up with vshuf.b
  // using the input shifted right by two (bits [6:2] of each byte), which is
  // unique for each of the six operators.
  // (char, hex, unique-5bits):
  //   (' ', 20, 00000) ('\t', 09, 01001) ('\n', 0A, 01010) ('\r', 0D, 01101)
  //   (',', 2C, 01011) (':', 3A, 01110) ('[', 5B, 10110) ('{', 7B, 11110)
  //   (']', 5D, 10111) ('}', 7D, 11111)
  const simd8<uint8_t> ws_table = simd8<uint8_t>::repeat_16(
    ' ', 0, 0, 0, 0, 0, 0, 0, 0, '\t', '\n', 0, 0, '\r', 0, 0
  );
  // Operator entries for shifted indexes 0..15.
  const simd8<uint8_t> op_table_lo = simd8<uint8_t>::repeat_16(
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ',', 0, 0, ':', 0
  );
  // Operator entries for shifted indexes 16..31.
  const simd8<uint8_t> op_table_hi = simd8<uint8_t>::repeat_16(
    0, 0, 0, 0, 0, 0, '[', ']', 0, 0, 0, 0, 0, 0, '{', '}'
  );

  // For every input byte, the whitespace character its table slot stands for.
  const simd::simd8x64<uint8_t> ws_expected{
    in.chunks[0].lookup_16(ws_table),
    in.chunks[1].lookup_16(ws_table),
    in.chunks[2].lookup_16(ws_table),
    in.chunks[3].lookup_16(ws_table)
  };
  // For every input byte, the operator character its table slot stands for.
  const simd::simd8x64<uint8_t> op_expected{
    __lsx_vshuf_b(op_table_hi, op_table_lo, in.chunks[0].shr<2>()),
    __lsx_vshuf_b(op_table_hi, op_table_lo, in.chunks[1].shr<2>()),
    __lsx_vshuf_b(op_table_hi, op_table_lo, in.chunks[2].shr<2>()),
    __lsx_vshuf_b(op_table_hi, op_table_lo, in.chunks[3].shr<2>())
  };

  // A byte belongs to a class exactly when it equals the value found in its slot.
  const uint64_t whitespace_mask = in.eq(ws_expected);
  const uint64_t operator_mask = in.eq(op_expected);
  return { whitespace_mask, operator_mask };
}
// Returns whether the OR-reduction of all chunks of the 64-byte input passes is_ascii().
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  const auto combined = input.reduce_or();
  return combined.is_ascii();
}
// Combines two saturating subtractions so that the result byte is >= 0x80 where
// prev2 is 111_____ (third byte of a sequence) or prev3 is 1111____ (fourth byte).
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  return prev2.saturating_sub(0xe0u-0x80)   // only 111_____ stays >= 0x80
       | prev3.saturating_sub(0xf0u-0x80);  // only 1111____ stays >= 0x80
}
} // unnamed namespace
} // namespace lsx
} // namespace simdjson
//
// Stage 2
//
//
// Implementation-specific overrides
//
namespace simdjson {
namespace lsx {

// Delegates to the generic stage 1 minifier, instantiated with 64.
simdjson_warn_unused error_code implementation::minify(
  const uint8_t *buf,
  size_t len,
  uint8_t *dst,
  size_t &dst_len
) const noexcept {
  const error_code status = lsx::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
  return status;
}

// Stores the input window on the parser, then runs the structural indexer over it.
simdjson_warn_unused error_code dom_parser_implementation::stage1(
  const uint8_t *_buf,
  size_t _len,
  stage1_mode streaming
) noexcept {
  this->buf = _buf;
  this->len = _len;
  const error_code status =
    lsx::stage1::json_structural_indexer::index<64>(this->buf, this->len, *this, streaming);
  return status;
}

// Delegates UTF-8 validation to the generic stage 1 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = lsx::stage1::generic_validate_utf8(buf,len);
  return is_valid;
}

// Runs the tape builder with its boolean template argument set to false.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  constexpr bool streaming = false;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}

// Runs the tape builder with its boolean template argument set to true.
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  constexpr bool streaming = true;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}

// Delegates to the lsx string parser and returns the pointer it produces.
SIMDJSON_NO_SANITIZE_MEMORY
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(
  const uint8_t *src,
  uint8_t *dst,
  bool allow_replacement
) const noexcept {
  uint8_t *const result = lsx::stringparsing::parse_string(src, dst, allow_replacement);
  return result;
}

// Delegates to the lsx "wobbly" string parser and returns the pointer it produces.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(
  const uint8_t *src,
  uint8_t *dst
) const noexcept {
  uint8_t *const result = lsx::stringparsing::parse_wobbly_string(src, dst);
  return result;
}

// Full parse: stage 1 in regular mode, then stage 2 only if stage 1 reported no error.
simdjson_warn_unused error_code dom_parser_implementation::parse(
  const uint8_t *_buf,
  size_t _len,
  dom::document &_doc
) noexcept {
  const error_code stage1_status = stage1(_buf, _len, stage1_mode::regular);
  if (stage1_status) {
    return stage1_status;
  }
  return stage2(_doc);
}

} // namespace lsx
} // namespace simdjson
/* including simdjson/lsx/end.h: #include <simdjson/lsx/end.h> */
/* begin file simdjson/lsx/end.h */
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include "simdjson/lsx/base.h" */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
/* undefining SIMDJSON_IMPLEMENTATION from "lsx" */
#undef SIMDJSON_IMPLEMENTATION
/* end file simdjson/lsx/end.h */
#endif // SIMDJSON_SRC_LSX_CPP
/* end file lsx.cpp */
#endif
#if SIMDJSON_IMPLEMENTATION_LASX
/* including lasx.cpp: #include <lasx.cpp> */
/* begin file lasx.cpp */
#ifndef SIMDJSON_SRC_LASX_CPP
#define SIMDJSON_SRC_LASX_CPP
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include <base.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
/* including simdjson/lasx.h: #include <simdjson/lasx.h> */
/* begin file simdjson/lasx.h */
#ifndef SIMDJSON_LASX_H
#define SIMDJSON_LASX_H
simdjson.cpp view on Meta::CPAN
private:
simdjson_inline json_structural_indexer(uint32_t *structural_indexes);
template<size_t STEP_SIZE>
simdjson_inline void step(const uint8_t *block, buf_block_reader<STEP_SIZE> &reader) noexcept;
simdjson_inline void next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx);
simdjson_inline error_code finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial);
json_scanner scanner{};
utf8_checker checker{};
bit_indexer indexer;
uint64_t prev_structurals = 0;
uint64_t unescaped_chars_error = 0;
};
// Hands the caller-provided structural index array to the bit indexer.
simdjson_inline json_structural_indexer::json_structural_indexer(uint32_t *structural_indexes)
  : indexer{structural_indexes} {
}
// Drops a trailing, incomplete UTF-8 character from the window.
//
// Looks at up to the last three bytes of buf[0..len). If the byte `back` positions
// from the end is a lead byte announcing more than `back` bytes, the character is
// cut off by the end of the window and the returned length stops just before it.
// Otherwise len is returned unchanged.
simdjson_inline size_t trim_partial_utf8(const uint8_t *buf, size_t len) {
  // min_lead[back-1]: smallest lead byte whose character needs more than `back` bytes.
  //   back == 1: 0xc0 -> 2-, 3- and 4-byte characters with only 1 byte left
  //   back == 2: 0xe0 -> 3- and 4-byte characters with only 2 bytes left
  //   back == 3: 0xf0 -> 4-byte characters with only 3 bytes left
  const uint8_t min_lead[3] = { 0xc0, 0xe0, 0xf0 };
  // Never look further back than the buffer is long.
  const size_t lookback = (len < 3) ? len : 3;
  for (size_t back = 1; back <= lookback; back++) {
    if (buf[len - back] >= min_lead[back - 1]) { return len - back; }
  }
  return len;
}
//
// PERF NOTES:
// We pipe 2 inputs through these stages:
// 1. Load JSON into registers. This takes a long time and is highly parallelizable, so we load
// 2 inputs' worth at once so that by the time step 2 is looking for the input, it's available.
// 2. Scan the JSON for critical data: strings, scalars and operators. This is the critical path.
// The output of step 1 depends entirely on this information. These functions don't quite use
// up enough CPU: the second half of the functions is highly serial, only using 1 execution core
// at a time. The second input's scans has some dependency on the first ones finishing it, but
// they can make a lot of progress before they need that information.
// 3. Step 1 does not use enough capacity, so we run some extra stuff while we're waiting for that
// to finish: utf-8 checks and generating the output from the last iteration.
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
// Stage 1 entry point: scans buf[0..len) in STEP_SIZE-byte blocks and lets finish()
// complete the parser's structural index.
//
// Returns CAPACITY when len exceeds the parser capacity, EMPTY for zero-length input,
// UTF8_ERROR when a streaming window is empty after trimming a partial UTF-8 character,
// UNEXPECTED_ERROR when no trailing block is left to read, otherwise whatever finish()
// reports.
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
  if (simdjson_unlikely(len > parser.capacity())) {
    return CAPACITY;
  }
  // From here on the code may assume len > 0.
  if (len == 0) {
    return EMPTY;
  }
  if (is_streaming(partial)) {
    len = trim_partial_utf8(buf, len);
    // A window that is empty once the partial UTF-8 bytes are trimmed most likely
    // points to a UTF-8 formatting error.
    if (len == 0) {
      return UTF8_ERROR;
    }
  }

  buf_block_reader<STEP_SIZE> block_reader(buf, len);
  json_structural_indexer structurals(parser.structural_indexes.get());

  // Every block except the last one is processed directly from the input.
  for (; block_reader.has_full_block(); ) {
    structurals.step<STEP_SIZE>(block_reader.full_block(), block_reader);
  }

  // The last block is copied into a local buffer first. It is always present unless
  // the file is empty, which is not supposed to happen at this point.
  uint8_t tail_block[STEP_SIZE];
  if (simdjson_unlikely(block_reader.get_remainder(tail_block) == 0)) {
    return UNEXPECTED_ERROR;
  }
  structurals.step<STEP_SIZE>(tail_block, block_reader);

  return structurals.finish(parser, block_reader.block_index(), len, partial);
}
// 128-byte step: loads two 64-byte inputs, scans both, then records both, and
// finally advances the reader.
template<>
simdjson_inline void json_structural_indexer::step<128>(const uint8_t *block, buf_block_reader<128> &reader) noexcept {
  // Load both halves up front.
  simd::simd8x64<uint8_t> lower_half(block);
  simd::simd8x64<uint8_t> upper_half(block + 64);
  // Scan both halves before recording either of them.
  json_block lower_scan = scanner.next(lower_half);
  json_block upper_scan = scanner.next(upper_half);
  this->next(lower_half, lower_scan, reader.block_index());
  this->next(upper_half, upper_scan, reader.block_index() + 64);
  reader.advance();
}
// 64-byte step: loads one input, scans it, records it, then advances the reader.
template<>
simdjson_inline void json_structural_indexer::step<64>(const uint8_t *block, buf_block_reader<64> &reader) noexcept {
  simd::simd8x64<uint8_t> input(block);
  json_block scanned = scanner.next(input);
  this->next(input, scanned, reader.block_index());
  reader.advance();
}
// Records one scanned 64-byte input: feeds the UTF-8 checker (when enabled), writes
// out the structurals remembered from the previous call, remembers this block's
// structurals for the next call, and accumulates unescaped-character errors.
simdjson_inline void json_structural_indexer::next(const simd::simd8x64<uint8_t>& in, const json_block& block, size_t idx) {
  // Bitmask of input bytes <= 0x1F.
  const uint64_t control_chars = in.lteq(0x1F);
#if SIMDJSON_UTF8VALIDATION
  checker.check_next_input(in);
#endif
  // The structurals being written belong to the *previous* iteration, hence idx-64.
  indexer.write(static_cast<uint32_t>(idx - 64), prev_structurals);
  prev_structurals = block.structural_start();
  unescaped_chars_error = unescaped_chars_error | block.non_quote_inside_string(control_chars);
}
/**
 * Completes stage 1 once the last block has been stepped.
 *
 * Flushes the structurals pending from the final iteration, checks the scanner's
 * final state, writes three extra entries after the last structural index
 * (len, len, 0) and, in the streaming modes, truncates the structural index to the
 * end of the last complete document.
 *
 * @param parser  parser whose structural_indexes, n_structural_indexes and
 *                next_structural_index are written.
 * @param idx     block index; the pending structurals are written at idx-64.
 * @param len     input length, stored in the padding entries.
 * @param partial stage 1 mode (regular, streaming_partial or streaming_final).
 * @return the scanner's error (UNCLOSED_STRING is tolerated when streaming),
 *         UNESCAPED_CHARS, EMPTY, UNEXPECTED_ERROR, CAPACITY, or otherwise the
 *         UTF-8 checker's result.
 */
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
// Number of structurals written so far: distance from the start of the array to the indexer's tail.
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
* The On-Demand API requires special padding.
*
* This is related to https://github.com/simdjson/simdjson/issues/906
* Basically, we want to make sure that if the parsing continues beyond the last (valid)
* structural character, it quickly stops.
* Only three structural characters can be repeated without triggering an error in JSON: [,] and }.
* We repeat the padding character (at 'len'). We don't know what it is, but if the parsing
* continues, then it must be [,] or }.
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
// Sanity check: the last structural index must not point beyond the input length.
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
// Omit the opening quote of an unclosed string, as in the streaming_partial case.
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
// otherwise, it will copy some prior index.
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
// This next line is critical, do not change it unless you understand what you are
// doing.
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
// We tolerate an unclosed string at the very end of the stream. Indeed, users
// often load their data in bulk without being careful and they want us to ignore
// the trailing garbage.
return EMPTY;
}
}
// Let the UTF-8 checker process end-of-input, then report its accumulated result.
checker.check_eof();
return checker.errors();
}
} // namespace stage1
} // unnamed namespace
} // namespace lasx
} // namespace simdjson
// Clear CUSTOM_BIT_INDEXER so other implementations can set it if they need to.
#undef SIMDJSON_GENERIC_JSON_STRUCTURAL_INDEXER_CUSTOM_BIT_INDEXER
#endif // SIMDJSON_SRC_GENERIC_STAGE1_JSON_STRUCTURAL_INDEXER_H
/* end file generic/stage1/json_structural_indexer.h for lasx */
/* including generic/stage1/utf8_validator.h for lasx: #include <generic/stage1/utf8_validator.h> */
/* begin file generic/stage1/utf8_validator.h for lasx */
#ifndef SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #define SIMDJSON_SRC_GENERIC_STAGE1_UTF8_VALIDATOR_H */
/* amalgamation skipped (editor-only): #include <generic/stage1/base.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/buf_block_reader.h> */
/* amalgamation skipped (editor-only): #include <generic/stage1/utf8_lookup4_algorithm.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
namespace simdjson {
namespace lasx {
namespace {
namespace stage1 {
/**
* Validates that the string is actual UTF-8.
*/
template<class checker>
bool generic_validate_utf8(const uint8_t * input, size_t length) {
checker c{};
buf_block_reader<64> reader(input, length);
simdjson.cpp view on Meta::CPAN
namespace {
using namespace simd;
simdjson_inline json_character_block json_character_block::classify(const simd::simd8x64<uint8_t>& in) {
  // Table-lookup classification (approach borrowed from the haswell kernel).
  //
  // Whitespace is looked up with lookup_16; operators are looked up with xvshuf.b
  // using the input shifted right by two (bits [6:2] of each byte), which is
  // unique for each of the six operators.
  // (char, hex, unique-5bits):
  //   (' ', 20, 00000) ('\t', 09, 01001) ('\n', 0A, 01010) ('\r', 0D, 01101)
  //   (',', 2C, 01011) (':', 3A, 01110) ('[', 5B, 10110) ('{', 7B, 11110)
  //   (']', 5D, 10111) ('}', 7D, 11111)
  const simd8<uint8_t> ws_table = simd8<uint8_t>::repeat_16(
    ' ', 0, 0, 0, 0, 0, 0, 0, 0, '\t', '\n', 0, 0, '\r', 0, 0
  );
  // Operator entries for shifted indexes 0..15.
  const simd8<uint8_t> op_table_lo = simd8<uint8_t>::repeat_16(
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ',', 0, 0, ':', 0
  );
  // Operator entries for shifted indexes 16..31.
  const simd8<uint8_t> op_table_hi = simd8<uint8_t>::repeat_16(
    0, 0, 0, 0, 0, 0, '[', ']', 0, 0, 0, 0, 0, 0, '{', '}'
  );

  // For every input byte, the whitespace character its table slot stands for.
  const simd::simd8x64<uint8_t> ws_expected{
    in.chunks[0].lookup_16(ws_table),
    in.chunks[1].lookup_16(ws_table)
  };
  // For every input byte, the operator character its table slot stands for.
  const simd::simd8x64<uint8_t> op_expected{
    __lasx_xvshuf_b(op_table_hi, op_table_lo, in.chunks[0].shr<2>()),
    __lasx_xvshuf_b(op_table_hi, op_table_lo, in.chunks[1].shr<2>())
  };

  // A byte belongs to a class exactly when it equals the value found in its slot.
  const uint64_t whitespace_mask = in.eq(ws_expected);
  const uint64_t operator_mask = in.eq(op_expected);
  return { whitespace_mask, operator_mask };
}
// Returns whether the OR-reduction of all chunks of the 64-byte input passes is_ascii().
simdjson_inline bool is_ascii(const simd8x64<uint8_t>& input) {
  const auto combined = input.reduce_or();
  return combined.is_ascii();
}
// Combines two saturating subtractions so that the result byte is >= 0x80 where
// prev2 is 111_____ (third byte of a sequence) or prev3 is 1111____ (fourth byte).
simdjson_inline simd8<uint8_t> must_be_2_3_continuation(const simd8<uint8_t> prev2, const simd8<uint8_t> prev3) {
  return prev2.saturating_sub(0xe0u-0x80)   // only 111_____ stays >= 0x80
       | prev3.saturating_sub(0xf0u-0x80);  // only 1111____ stays >= 0x80
}
} // unnamed namespace
} // namespace lasx
} // namespace simdjson
//
// Stage 2
//
//
// Implementation-specific overrides
//
namespace simdjson {
namespace lasx {

// Delegates to the generic stage 1 minifier, instantiated with 64.
simdjson_warn_unused error_code implementation::minify(
  const uint8_t *buf,
  size_t len,
  uint8_t *dst,
  size_t &dst_len
) const noexcept {
  const error_code status = lasx::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
  return status;
}

// Stores the input window on the parser, then runs the structural indexer over it
// in 64-byte steps.
simdjson_warn_unused error_code dom_parser_implementation::stage1(
  const uint8_t *_buf,
  size_t _len,
  stage1_mode streaming
) noexcept {
  this->buf = _buf;
  this->len = _len;
  const error_code status =
    lasx::stage1::json_structural_indexer::index<64>(this->buf, this->len, *this, streaming);
  return status;
}

// Delegates UTF-8 validation to the generic stage 1 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
  const bool is_valid = lasx::stage1::generic_validate_utf8(buf,len);
  return is_valid;
}

// Runs the tape builder with its boolean template argument set to false.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
  constexpr bool streaming = false;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}

// Runs the tape builder with its boolean template argument set to true.
simdjson_warn_unused error_code dom_parser_implementation::stage2_next(dom::document &_doc) noexcept {
  constexpr bool streaming = true;
  return stage2::tape_builder::parse_document<streaming>(*this, _doc);
}

// Delegates to the lasx string parser and returns the pointer it produces.
SIMDJSON_NO_SANITIZE_MEMORY
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_string(
  const uint8_t *src,
  uint8_t *dst,
  bool allow_replacement
) const noexcept {
  uint8_t *const result = lasx::stringparsing::parse_string(src, dst, allow_replacement);
  return result;
}

// Delegates to the lasx "wobbly" string parser and returns the pointer it produces.
simdjson_warn_unused uint8_t *dom_parser_implementation::parse_wobbly_string(
  const uint8_t *src,
  uint8_t *dst
) const noexcept {
  uint8_t *const result = lasx::stringparsing::parse_wobbly_string(src, dst);
  return result;
}

// Full parse: stage 1 in regular mode, then stage 2 only if stage 1 reported no error.
simdjson_warn_unused error_code dom_parser_implementation::parse(
  const uint8_t *_buf,
  size_t _len,
  dom::document &_doc
) noexcept {
  const error_code stage1_status = stage1(_buf, _len, stage1_mode::regular);
  if (stage1_status) {
    return stage1_status;
  }
  return stage2(_doc);
}

} // namespace lasx
} // namespace simdjson
/* including simdjson/lasx/end.h: #include <simdjson/lasx/end.h> */
/* begin file simdjson/lasx/end.h */
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include "simdjson/lasx/base.h" */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
#undef SIMDJSON_SKIP_BACKSLASH_SHORT_CIRCUIT
/* undefining SIMDJSON_IMPLEMENTATION from "lasx" */
#undef SIMDJSON_IMPLEMENTATION
/* end file simdjson/lasx/end.h */
#endif // SIMDJSON_SRC_LASX_CPP
/* end file lasx.cpp */
#endif
#if SIMDJSON_IMPLEMENTATION_FALLBACK
/* including fallback.cpp: #include <fallback.cpp> */
/* begin file fallback.cpp */
#ifndef SIMDJSON_SRC_FALLBACK_CPP
#define SIMDJSON_SRC_FALLBACK_CPP
/* amalgamation skipped (editor-only): #ifndef SIMDJSON_CONDITIONAL_INCLUDE */
/* amalgamation skipped (editor-only): #include <base.h> */
/* amalgamation skipped (editor-only): #endif // SIMDJSON_CONDITIONAL_INCLUDE */
/* including simdjson/fallback.h: #include <simdjson/fallback.h> */
/* begin file simdjson/fallback.h */
#ifndef SIMDJSON_FALLBACK_H
#define SIMDJSON_FALLBACK_H
simdjson.cpp view on Meta::CPAN
#endif // SIMDJSON_SRC_GENERIC_STAGE2_TAPE_BUILDER_H
/* end file generic/stage2/tape_builder.h for fallback */
//
// Stage 1
//
namespace simdjson {
namespace fallback {
// Creates a fallback DOM parser in `dst` and sizes it.
//
// Returns MEMALLOC if the non-throwing allocation fails; otherwise the first error
// reported by set_capacity(capacity) or set_max_depth(max_depth), or SUCCESS when
// both succeed. On a sizing error `dst` still holds the newly created parser.
simdjson_warn_unused error_code implementation::create_dom_parser_implementation(
  size_t capacity,
  size_t max_depth,
  std::unique_ptr<internal::dom_parser_implementation>& dst
) const noexcept {
  dst.reset( new (std::nothrow) fallback::dom_parser_implementation() );
  if (!dst) {
    return MEMALLOC;
  }
  // The depth is only configured once the capacity has been accepted.
  error_code status = dst->set_capacity(capacity);
  if (status == SUCCESS) {
    status = dst->set_max_depth(max_depth);
  }
  return status;
}
namespace {
namespace stage1 {
class structural_scanner {
public:
simdjson_inline structural_scanner(dom_parser_implementation &_parser, stage1_mode _partial)
: buf{_parser.buf},
next_structural_index{_parser.structural_indexes.get()},
parser{_parser},
len{static_cast<uint32_t>(_parser.len)},
partial{_partial} {
}
simdjson_inline void add_structural() {
*next_structural_index = idx;
next_structural_index++;
}
simdjson_inline bool is_continuation(uint8_t c) {
return (c & 0xc0) == 0x80;
}
simdjson_inline void validate_utf8_character() {
// Continuation
if (simdjson_unlikely((buf[idx] & 0x40) == 0)) {
// extra continuation
error = UTF8_ERROR;
idx++;
return;
}
// 2-byte
if ((buf[idx] & 0x20) == 0) {
// missing continuation
if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
if (idx+1 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 1100000_ 10______
if (buf[idx] <= 0xc1) { error = UTF8_ERROR; }
idx += 2;
return;
}
// 3-byte
if ((buf[idx] & 0x10) == 0) {
// missing continuation
if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 11100000 100_____ ________
if (buf[idx] == 0xe0 && buf[idx+1] <= 0x9f) { error = UTF8_ERROR; }
// surrogates: U+D800-U+DFFF 11101101 101_____
if (buf[idx] == 0xed && buf[idx+1] >= 0xa0) { error = UTF8_ERROR; }
idx += 3;
return;
}
// 4-byte
// missing continuation
if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 11110000 1000____ ________ ________
if (buf[idx] == 0xf0 && buf[idx+1] <= 0x8f) { error = UTF8_ERROR; }
// too large: > U+10FFFF:
// 11110100 (1001|101_)____
// 1111(1___|011_|0101) 10______
// also includes 5, 6, 7 and 8 byte characters:
// 11111___
if (buf[idx] == 0xf4 && buf[idx+1] >= 0x90) { error = UTF8_ERROR; }
if (buf[idx] >= 0xf5) { error = UTF8_ERROR; }
idx += 4;
}
// Returns true if the string is unclosed.
simdjson_inline bool validate_string() {
idx++; // skip first quote
while (idx < len && buf[idx] != '"') {
if (buf[idx] == '\\') {
idx += 2;
} else if (simdjson_unlikely(buf[idx] & 0x80)) {
validate_utf8_character();
} else {
if (buf[idx] < 0x20) { error = UNESCAPED_CHARS; }
idx++;
}
}
if (idx >= len) { return true; }
return false;
}
simdjson_inline bool is_whitespace_or_operator(uint8_t c) {
switch (c) {
case '{': case '}': case '[': case ']': case ',': case ':':
case ' ': case '\r': case '\n': case '\t':
return true;
default:
return false;
}
}
//
// Parse the entire input in STEP_SIZE-byte chunks.
//
simdjson_inline error_code scan() {
bool unclosed_string = false;
for (;idx<len;idx++) {
switch (buf[idx]) {
// String
case '"':
add_structural();
unclosed_string |= validate_string();
break;
// Operator
case '{': case '}': case '[': case ']': case ',': case ':':
add_structural();
break;
// Whitespace
case ' ': case '\r': case '\n': case '\t':
break;
// Primitive or invalid character (invalid characters will be checked in stage 2)
default:
// Anything else, add the structural and go until we find the next one
add_structural();
while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
idx++;
};
break;
}
}
// We pad beyond.
// https://github.com/simdjson/simdjson/issues/906
// See json_structural_indexer.h for an explanation.
*next_structural_index = len; // assumed later in partial == stage1_mode::streaming_final
next_structural_index[1] = len;
next_structural_index[2] = 0;
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
parser.next_structural_index = 0;
if (partial == stage1_mode::streaming_partial) {
if(unclosed_string) {
parser.n_structural_indexes--;
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if(partial == stage1_mode::streaming_final) {
if(unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
// then this will query parser.structural_indexes[parser.n_structural_indexes] which is len,
// otherwise, it will copy some prior index.
parser.structural_indexes[parser.n_structural_indexes + 1] = parser.structural_indexes[parser.n_structural_indexes];
// This next line is critical, do not change it unless you understand what you are
// doing.
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len);
if (parser.n_structural_indexes == 0) { return EMPTY; }
} else if(unclosed_string) { error = UNCLOSED_STRING; }
return error;
}
private:
const uint8_t *buf;
uint32_t *next_structural_index;
dom_parser_implementation &parser;
uint32_t len;
uint32_t idx{0};
error_code error{SUCCESS};
stage1_mode partial;
}; // structural_scanner
} // namespace stage1
} // unnamed namespace
// Fallback stage 1: stores the input window on the parser and runs the scalar
// structural scanner over it.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode partial) noexcept {
  // The scanner's constructor reads buf and len back from the parser, so they have
  // to be stored before it is created.
  buf = _buf;
  len = _len;
  return stage1::structural_scanner(*this, partial).scan();
}
// big table for the minifier
static uint8_t jump_table[256 * 3] = {
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,
1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1,