view release on metacpan or search on metacpan
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a sizable file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
// Minify the JSON in buf[0..len) into dst, storing the number of bytes
// written into dst_len. Delegates to the arm64 stage-1 minifier with a
// 64-byte block step.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return arm64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
// Stage 1: structural indexing. Records the input pointer and length on this
// parser, then runs the arm64 structural indexer (64-byte block step), which
// fills parser.structural_indexes. `streaming` selects regular /
// streaming_partial / streaming_final handling.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
// Pass the parameters directly (rather than re-reading the members we just
// assigned) for consistency with the other architecture back-ends.
return arm64::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
// Return true when buf[0..len) is well-formed UTF-8, using the arm64
// stage-1 UTF-8 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return arm64::stage1::generic_validate_utf8(buf,len);
}
// Stage 2: replay the structural indexes produced by stage 1 and build the
// DOM tape into _doc. The `false` template argument presumably selects the
// non-streaming parse path — confirm against tape_builder's declaration.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
// extra continuation
error = UTF8_ERROR;
idx++;
return;
}
// 2-byte
if ((buf[idx] & 0x20) == 0) {
// missing continuation
if (simdjson_unlikely(idx+1 > len || !is_continuation(buf[idx+1]))) {
if (idx+1 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 1100000_ 10______
if (buf[idx] <= 0xc1) { error = UTF8_ERROR; }
idx += 2;
return;
}
// 3-byte
if ((buf[idx] & 0x10) == 0) {
// missing continuation
if (simdjson_unlikely(idx+2 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]))) {
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 11100000 100_____ ________
if (buf[idx] == 0xe0 && buf[idx+1] <= 0x9f) { error = UTF8_ERROR; }
// surrogates: U+D800-U+DFFF 11101101 101_____
if (buf[idx] == 0xed && buf[idx+1] >= 0xa0) { error = UTF8_ERROR; }
idx += 3;
return;
}
// 4-byte
// missing continuation
if (simdjson_unlikely(idx+3 > len || !is_continuation(buf[idx+1]) || !is_continuation(buf[idx+2]) || !is_continuation(buf[idx+3]))) {
if (idx+2 > len && is_streaming(partial)) { idx = len; return; }
error = UTF8_ERROR;
idx++;
return;
}
// overlong: 11110000 1000____ ________ ________
if (buf[idx] == 0xf0 && buf[idx+1] <= 0x8f) { error = UTF8_ERROR; }
// too large: > U+10FFFF:
// 11110100 (1001|101_)____
// 1111(1___|011_|0101) 10______
// also includes 5, 6, 7 and 8 byte characters:
simdjson.cpp view on Meta::CPAN
add_structural();
while (idx+1<len && !is_whitespace_or_operator(buf[idx+1])) {
idx++;
};
break;
}
}
// We pad beyond.
// https://github.com/simdjson/simdjson/issues/906
// See json_structural_indexer.h for an explanation.
*next_structural_index = len; // assumed later in partial == stage1_mode::streaming_final
next_structural_index[1] = len;
next_structural_index[2] = 0;
parser.n_structural_indexes = uint32_t(next_structural_index - parser.structural_indexes.get());
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return EMPTY; }
parser.next_structural_index = 0;
if (partial == stage1_mode::streaming_partial) {
if(unclosed_string) {
parser.n_structural_indexes--;
if (simdjson_unlikely(parser.n_structural_indexes == 0)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
if(parser.structural_indexes[0] == 0) {
// If the buffer is partial and we started at index 0 but the document is
// incomplete, it's too big to parse.
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if(partial == stage1_mode::streaming_final) {
if(unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a sizable file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a size file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
// Minify the JSON in buf[0..len) into dst, storing the number of bytes
// written into dst_len. Delegates to the icelake stage-1 minifier with a
// 128-byte block step.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return icelake::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}
// Stage 1: structural indexing. Records the input pointer and length on this
// parser, then runs the icelake structural indexer (128-byte block step),
// which fills parser.structural_indexes. `streaming` selects regular /
// streaming_partial / streaming_final handling.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return icelake::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}
// Return true when buf[0..len) is well-formed UTF-8, using the icelake
// stage-1 UTF-8 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return icelake::stage1::generic_validate_utf8(buf,len);
}
// Stage 2: replay the structural indexes produced by stage 1 and build the
// DOM tape into _doc. The `false` template argument presumably selects the
// non-streaming parse path — confirm against tape_builder's declaration.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a sizable file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
// Minify the JSON in buf[0..len) into dst, storing the number of bytes
// written into dst_len. Delegates to the haswell stage-1 minifier with a
// 128-byte block step.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return haswell::stage1::json_minifier::minify<128>(buf, len, dst, dst_len);
}
// Stage 1: structural indexing. Records the input pointer and length on this
// parser, then runs the haswell structural indexer (128-byte block step),
// which fills parser.structural_indexes. `streaming` selects regular /
// streaming_partial / streaming_final handling.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
return haswell::stage1::json_structural_indexer::index<128>(_buf, _len, *this, streaming);
}
// Return true when buf[0..len) is well-formed UTF-8, using the haswell
// stage-1 UTF-8 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return haswell::stage1::generic_validate_utf8(buf,len);
}
// Stage 2: replay the structural indexes produced by stage 1 and build the
// DOM tape into _doc. The `false` template argument presumably selects the
// non-streaming parse path — confirm against tape_builder's declaration.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
// the data from a sizable file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
// Minify the JSON in buf[0..len) into dst, storing the number of bytes
// written into dst_len. Delegates to the ppc64 stage-1 minifier with a
// 64-byte block step.
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
return ppc64::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
// Stage 1: structural indexing. Records the input pointer and length on this
// parser, then runs the ppc64 structural indexer (64-byte block step), which
// fills parser.structural_indexes. `streaming` selects regular /
// streaming_partial / streaming_final handling.
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
this->buf = _buf;
this->len = _len;
// Pass the parameters directly (rather than re-reading the members we just
// assigned) for consistency with the other architecture back-ends.
return ppc64::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
// Return true when buf[0..len) is well-formed UTF-8, using the ppc64
// stage-1 UTF-8 validator.
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
return ppc64::stage1::generic_validate_utf8(buf,len);
}
// Stage 2: replay the structural indexes produced by stage 1 and build the
// DOM tape into _doc. The `false` template argument presumably selects the
// non-streaming parse path — confirm against tape_builder's declaration.
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
simdjson.cpp view on Meta::CPAN
//
// The reason we run 2 inputs at a time, is steps 2 and 3 are *still* not enough to soak up all
// available capacity with just one input. Running 2 at a time seems to give the CPU a good enough
// workout.
//
template<size_t STEP_SIZE>
error_code json_structural_indexer::index(const uint8_t *buf, size_t len, dom_parser_implementation &parser, stage1_mode partial) noexcept {
if (simdjson_unlikely(len > parser.capacity())) { return CAPACITY; }
// We guard the rest of the code so that we can assume that len > 0 throughout.
if (len == 0) { return EMPTY; }
if (is_streaming(partial)) {
len = trim_partial_utf8(buf, len);
// If you end up with an empty window after trimming
// the partial UTF-8 bytes, then chances are good that you
// have an UTF-8 formatting error.
if(len == 0) { return UTF8_ERROR; }
}
buf_block_reader<STEP_SIZE> reader(buf, len);
json_structural_indexer indexer(parser.structural_indexes.get());
// Read all but the last block
simdjson.cpp view on Meta::CPAN
prev_structurals = block.structural_start();
unescaped_chars_error |= block.non_quote_inside_string(unescaped);
}
simdjson_inline error_code json_structural_indexer::finish(dom_parser_implementation &parser, size_t idx, size_t len, stage1_mode partial) {
// Write out the final iteration's structurals
indexer.write(uint32_t(idx-64), prev_structurals);
error_code error = scanner.finish();
// We deliberately break down the next expression so that it is
// human readable.
const bool should_we_exit = is_streaming(partial) ?
((error != SUCCESS) && (error != UNCLOSED_STRING)) // when partial we tolerate UNCLOSED_STRING
: (error != SUCCESS); // if partial is false, we must have SUCCESS
const bool have_unclosed_string = (error == UNCLOSED_STRING);
if (simdjson_unlikely(should_we_exit)) { return error; }
if (unescaped_chars_error) {
return UNESCAPED_CHARS;
}
parser.n_structural_indexes = uint32_t(indexer.tail - parser.structural_indexes.get());
/***
simdjson.cpp view on Meta::CPAN
* Suppose it is ] or }. We backtrack to the first character, what could it be that would
* not trigger an error? It could be ] or } but no, because you can't start a document that way.
* It can't be a comma, a colon or any simple value. So the only way we could continue is
* if the repeated character is [. But if so, the document must start with [. But if the document
* starts with [, it should end with ]. If we enforce that rule, then we would get
* ][[ which is invalid.
*
* This is illustrated with the test array_iterate_unclosed_error() on the following input:
* R"({ "a": [,,)"
**/
parser.structural_indexes[parser.n_structural_indexes] = uint32_t(len); // used later in partial == stage1_mode::streaming_final
parser.structural_indexes[parser.n_structural_indexes + 1] = uint32_t(len);
parser.structural_indexes[parser.n_structural_indexes + 2] = 0;
parser.next_structural_index = 0;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) {
return EMPTY;
}
if (simdjson_unlikely(parser.structural_indexes[parser.n_structural_indexes - 1] > len)) {
return UNEXPECTED_ERROR;
}
if (partial == stage1_mode::streaming_partial) {
// If we have an unclosed string, then the last structural
// will be the quote and we want to make sure to omit it.
if(have_unclosed_string) {
parser.n_structural_indexes--;
// a valid JSON file cannot have zero structural indexes - we should have found something
if (simdjson_unlikely(parser.n_structural_indexes == 0u)) { return CAPACITY; }
}
// We truncate the input to the end of the last complete document (or zero).
auto new_structural_indexes = find_next_document_index(parser);
if (new_structural_indexes == 0 && parser.n_structural_indexes > 0) {
simdjson.cpp view on Meta::CPAN
return CAPACITY;
} else {
// It is possible that the document could be parsed, we just had a lot
// of white space.
parser.n_structural_indexes = 0;
return EMPTY;
}
}
parser.n_structural_indexes = new_structural_indexes;
} else if (partial == stage1_mode::streaming_final) {
if(have_unclosed_string) { parser.n_structural_indexes--; }
// We truncate the input to the end of the last complete document (or zero).
// Because partial == stage1_mode::streaming_final, it means that we may
// silently ignore trailing garbage. Though it sounds bad, we do it
// deliberately because many people who have streams of JSON documents
// will truncate them for processing. E.g., imagine that you are uncompressing
the data from a sizable file or receiving it in chunks from the network. You
// may not know where exactly the last document will be. Meanwhile the
// document_stream instances allow people to know the JSON documents they are
// parsing (see the iterator.source() method).
parser.n_structural_indexes = find_next_document_index(parser);
// We store the initial n_structural_indexes so that the client can see
// whether we used truncation. If initial_n_structural_indexes == parser.n_structural_indexes,
simdjson.cpp view on Meta::CPAN
return find_escaped_branchless(backslash);
}
} // namespace stage1
} // unnamed namespace
simdjson_warn_unused error_code implementation::minify(const uint8_t *buf, size_t len, uint8_t *dst, size_t &dst_len) const noexcept {
// Delegate minification to the westmere-kernel SIMD minifier, which consumes
// the input in 64-byte steps. On success, dst_len receives the number of
// bytes written to dst.
return westmere::stage1::json_minifier::minify<64>(buf, len, dst, dst_len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage1(const uint8_t *_buf, size_t _len, stage1_mode streaming) noexcept {
  // Record the input window before indexing: later stages (and
  // document_stream, which re-anchors its iterators after each stage 1 run)
  // read the buffer and length back from the parser.
  this->len = _len;
  this->buf = _buf;
  // Build the structural indexes over the input, 64 bytes at a time.
  return westmere::stage1::json_structural_indexer::index<64>(_buf, _len, *this, streaming);
}
simdjson_warn_unused bool implementation::validate_utf8(const char *buf, size_t len) const noexcept {
// Returns true if buf[0..len) is valid UTF-8, using the westmere-kernel
// SIMD validator.
return westmere::stage1::generic_validate_utf8(buf,len);
}
simdjson_warn_unused error_code dom_parser_implementation::stage2(dom::document &_doc) noexcept {
// Stage 2 walks the structural indexes produced by stage1() and materializes
// the document tape into _doc. NOTE(review): the <false> template argument
// presumably selects the non-streaming code path — confirm against
// tape_builder::parse_document.
return stage2::tape_builder::parse_document<false>(*this, _doc);
}
namespace simdjson {
namespace dom {
class document;
} // namespace dom
/**
 * This enum is used with the dom_parser_implementation::stage1 function.
 * 1) The regular mode expects a fully formed JSON document.
 * 2) The streaming_partial mode expects a possibly truncated
 * input within a stream of JSON documents.
 * 3) The streaming_final mode allows us to truncate final
 * unterminated strings. It is useful in conjunction with streaming_partial.
 */
enum class stage1_mode { regular, streaming_partial, streaming_final};
/**
 * Returns true if mode == streaming_partial or mode == streaming_final
 */
inline bool is_streaming(stage1_mode mode) {
  // performance note: comparing against the single non-streaming mode is
  // probably cheaper than testing each of the two streaming modes in turn.
  const bool regular_mode = (mode == stage1_mode::regular);
  return !regular_mode;
}
namespace internal {
/**
* An implementation of simdjson's DOM parser for a particular CPU architecture.
*
* This class is expected to be accessed only by pointer, and never move in memory (though the
* @private For internal implementation use
*
* Stage 1 of the document parser.
*
* Guaranteed only to be called when capacity > document length.
*
* Overridden by each implementation.
*
* @param buf The json document to parse.
* @param len The length of the json document.
* @param streaming Whether this is being called by parser::parse_many.
* @return The error code, or SUCCESS if there was no error.
*/
simdjson_warn_unused virtual error_code stage1(const uint8_t *buf, size_t len, stage1_mode streaming) noexcept = 0;
/**
* @private For internal implementation use
*
* Stage 2 of the document parser.
*
* Called after stage1().
*
* Overridden by each implementation.
*
*/
inline size_t size_in_bytes() const noexcept;
/**
* After iterating through the stream, this method
* returns the number of bytes that were not parsed at the end
* of the stream. If truncated_bytes() differs from zero,
* then the input was truncated maybe because incomplete JSON
* documents were found at the end of the stream. You
* may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()).
*
* You should only call truncated_bytes() after streaming through all
* documents, like so:
*
* document_stream stream = parser.parse_many(json,window);
* for(auto doc : stream) {
* // do something with doc
* }
* size_t truncated = stream.truncated_bytes();
*
*/
inline size_t truncated_bytes() const noexcept;
return parser->implementation->structural_indexes[parser->implementation->n_structural_indexes] - parser->implementation->structural_indexes[parser->implementation->n_structural_indexes + 1];
}
inline size_t document_stream::next_batch_start() const noexcept {
  // Stage 1 stores, one slot past the last structural, the byte offset within
  // the current batch at which indexing stopped; resume the next batch there.
  const auto &impl = *parser->implementation;
  return batch_start + impl.structural_indexes[impl.n_structural_indexes];
}
inline error_code document_stream::run_stage1(dom::parser &p, size_t _batch_start) noexcept {
  const size_t bytes_left = len - _batch_start;
  // The last window is indexed in streaming_final mode, so unterminated
  // trailing content can be tolerated; every earlier window uses
  // streaming_partial, since a document may continue past the window's end.
  const bool last_window = (bytes_left <= batch_size);
  if (last_window) {
    return p.implementation->stage1(&buf[_batch_start], bytes_left, stage1_mode::streaming_final);
  }
  return p.implementation->stage1(&buf[_batch_start], batch_size, stage1_mode::streaming_partial);
}
#ifdef SIMDJSON_THREADS_ENABLED
inline void document_stream::load_from_stage1_thread() noexcept {
worker->finish();
// Swap to the parser that was loaded up in the thread. Make sure the parser has
// enough memory to swap to, as well.
std::swap(*parser, stage1_thread_parser);
*
* - 0 = finished with document
* - 1 = document root value (could be [ or {, not yet known)
* - 2 = , or } inside root array/object
* - 3 = key or value inside root array/object.
*/
depth_t _depth{};
/**
* Beginning of the document indexes.
* Normally we have root == parser->implementation->structural_indexes.get()
* but this may differ, especially in streaming mode (where we have several
* documents);
*/
token_position _root{};
/**
* Normally, a json_iterator operates over a single document, but in
* some cases, we may have a stream of documents. This attribute is meant
* as meta-data: the json_iterator works the same irrespective of the
* value of this attribute.
*/
bool _streaming{false};
public:
simdjson_inline json_iterator() noexcept = default;
simdjson_inline json_iterator(json_iterator &&other) noexcept;
simdjson_inline json_iterator &operator=(json_iterator &&other) noexcept;
simdjson_inline explicit json_iterator(const json_iterator &other) noexcept = default;
simdjson_inline json_iterator &operator=(const json_iterator &other) noexcept = default;
/**
* Skips a JSON value, whether it is a scalar, array or object.
*/
simdjson_warn_unused simdjson_inline error_code skip_child(depth_t parent_depth) noexcept;
/**
* Tell whether the iterator is still at the start
*/
simdjson_inline bool at_root() const noexcept;
/**
* Tell whether we should be expected to run in streaming
* mode (iterating over many documents). It is pure metadata
* that does not affect how the iterator works. It is used by
* start_root_array() and start_root_object().
*/
simdjson_inline bool streaming() const noexcept;
/**
* Get the root value iterator
*/
simdjson_inline token_position root_position() const noexcept;
/**
* Assert that we are at the document depth (== 1)
*/
simdjson_inline void assert_at_document_depth() const noexcept;
/**
inline size_t size_in_bytes() const noexcept;
/**
* After iterating through the stream, this method
* returns the number of bytes that were not parsed at the end
* of the stream. If truncated_bytes() differs from zero,
* then the input was truncated maybe because incomplete JSON
* documents were found at the end of the stream. You
* may need to process the bytes in the interval [size_in_bytes()-truncated_bytes(), size_in_bytes()).
*
* You should only call truncated_bytes() after streaming through all
* documents, like so:
*
* document_stream stream = parser.iterate_many(json,window);
* for(auto & doc : stream) {
* // do something with doc
* }
* size_t truncated = stream.truncated_bytes();
*
*/
inline size_t truncated_bytes() const noexcept;
namespace SIMDJSON_BUILTIN_IMPLEMENTATION {
namespace ondemand {
simdjson_inline json_iterator::json_iterator(json_iterator &&other) noexcept
  : token(std::move(other.token)),
    parser{other.parser},
    _string_buf_loc{other._string_buf_loc},
    error{other.error},
    _depth{other._depth},
    _root{other._root},
    _streaming{other._streaming}
{
  // Mark the source as moved-from: a null parser leaves it safely
  // destructible.
  other.parser = nullptr;
}
simdjson_inline json_iterator &json_iterator::operator=(json_iterator &&other) noexcept {
// Guard against self-move: without the check, `other.parser = nullptr` below
// would null out our own parser and leave this iterator unusable.
if (this != &other) {
token = other.token;
parser = other.parser;
_string_buf_loc = other._string_buf_loc;
error = other.error;
_depth = other._depth;
_root = other._root;
_streaming = other._streaming;
// Mark the source as moved-from (null parser => safely destructible).
other.parser = nullptr;
}
return *this;
}
simdjson_inline json_iterator::json_iterator(const uint8_t *buf, ondemand::parser *_parser) noexcept
// Anchor the token cursor at the first structural index of the document.
: token(buf, &_parser->implementation->structural_indexes[0]),
parser{_parser},
_string_buf_loc{parser->string_buf.get()},
// Depth 1 corresponds to the document root value.
_depth{1},
_root{parser->implementation->structural_indexes.get()},
// Not streaming by default; document_stream sets this flag to true when
// iterating over many documents.
_streaming{false}
{
logger::log_headers();
#if SIMDJSON_CHECK_EOF
// When end-of-file checking is compiled in, presumably asserts up front that
// at least one token is available — TODO confirm against assert_more_tokens().
assert_more_tokens();
#endif
}
inline void json_iterator::rewind() noexcept {
token.set_position( root_position() );
SIMDJSON_POP_DISABLE_WARNINGS
simdjson_inline bool json_iterator::at_root() const noexcept {
  // We are at the root exactly when the cursor has not advanced past the
  // document's first structural token.
  const auto current = position();
  return current == root_position();
}
simdjson_inline bool json_iterator::is_single_token() const noexcept {
  // True when stage 1 found exactly one structural token in the document.
  const auto structural_count = parser->implementation->n_structural_indexes;
  return structural_count == 1;
}
simdjson_inline bool json_iterator::streaming() const noexcept {
// Pure metadata: set by document_stream when iterating over many documents;
// it does not change how the iterator advances.
return _streaming;
}
simdjson_inline token_position json_iterator::root_position() const noexcept {
// Position of the current document's first structural token; in streaming
// mode this may differ from the parser's first structural index.
return _root;
}
simdjson_inline void json_iterator::assert_at_document_depth() const noexcept {
// Depth 1 corresponds to the document root value.
SIMDJSON_ASSUME( _depth == 1 );
}
if (*_json_iter->peek() == '}') {
logger::log_value(*_json_iter, "empty object");
_json_iter->return_current_and_advance();
end_container();
return false;
}
return true;
}
simdjson_warn_unused simdjson_inline error_code value_iterator::check_root_object() noexcept {
// When in streaming mode, we cannot expect peek_last() to be the last structural element of the
// current document. It only works in the normal mode where we have indexed a single document.
// Note that adding a check for 'streaming' is not expensive since we only have at most
// one root element.
if ( ! _json_iter->streaming() ) {
// The following lines do not fully protect against garbage content within the
// object: e.g., `{"a":2} foo }`. Users concerned with garbage content should
// call `at_end()` on the document instance at the end of the processing to
// ensure that the processing has finished at the end.
//
if (*_json_iter->peek_last() != '}') {
_json_iter->abandon();
return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing } at end");
}
// If the last character is } *and* the first gibberish character is also '}'
return false;
}
_json_iter->descend_to(depth()+1);
#if SIMDJSON_DEVELOPMENT_CHECKS
_json_iter->set_start_position(_depth, start_position());
#endif
return true;
}
simdjson_warn_unused simdjson_inline error_code value_iterator::check_root_array() noexcept {
// When in streaming mode, we cannot expect peek_last() to be the last structural element of the
// current document. It only works in the normal mode where we have indexed a single document.
// Note that adding a check for 'streaming' is not expensive since we only have at most
// one root element.
if ( ! _json_iter->streaming() ) {
// The following lines do not fully protect against garbage content within the
// array: e.g., `[1, 2] foo]`. Users concerned with garbage content should
// also call `at_end()` on the document instance at the end of the processing to
// ensure that the processing has finished at the end.
//
if (*_json_iter->peek_last() != ']') {
_json_iter->abandon();
return report_error(INCOMPLETE_ARRAY_OR_OBJECT, "missing ] at end");
}
// If the last character is ] *and* the first gibberish character is also ']'
error = run_stage1(*parser, batch_start);
while(error == EMPTY) {
// In exceptional cases, we may start with an empty block
batch_start = next_batch_start();
if (batch_start >= len) { return; }
error = run_stage1(*parser, batch_start);
}
if (error) { return; }
doc_index = batch_start;
doc = document(json_iterator(&buf[batch_start], parser));
doc.iter._streaming = true;
#ifdef SIMDJSON_THREADS_ENABLED
if (use_thread && next_batch_start() < len) {
// Kick off the first thread on next batch if needed
error = stage1_thread_parser.allocate(batch_size);
if (error) { return; }
worker->start_thread();
start_stage1_thread();
if (error) { return; }
}
*
* That is, stage1 calls "this->buf = _buf" so the parser remembers the buffer that
* we used. But json_iterator has no callback when stage1 is called on the parser.
* In fact, I think that the parser is unaware of json_iterator.
*
*
* So we need to re-anchor the json_iterator after each call to stage 1 so that
* all of the pointers are in sync.
*/
doc.iter = json_iterator(&buf[batch_start], parser);
doc.iter._streaming = true;
/**
* End of resync.
*/
if (error) { continue; } // If the error was EMPTY, we may want to load another batch.
doc_index = batch_start;
}
}
}
inline size_t document_stream::next_batch_start() const noexcept {
  // The slot one past the last structural index records where indexing of the
  // current batch stopped; the next batch resumes from that offset.
  const auto &impl = *parser->implementation;
  const size_t consumed = impl.structural_indexes[impl.n_structural_indexes];
  return batch_start + consumed;
}
inline error_code document_stream::run_stage1(ondemand::parser &p, size_t _batch_start) noexcept {
  // This only refreshes the structural indexes held by the parser; any live
  // json_iterator must be re-anchored by the caller afterwards.
  const size_t remaining = len - _batch_start;
  if (remaining > batch_size) {
    // Interior window: a document may straddle the window boundary, so index
    // in streaming_partial mode (truncation at the end is expected).
    return p.implementation->stage1(&buf[_batch_start], batch_size, stage1_mode::streaming_partial);
  }
  // Final window: streaming_final allows unterminated trailing content to be
  // dropped.
  return p.implementation->stage1(&buf[_batch_start], remaining, stage1_mode::streaming_final);
}
simdjson_inline size_t document_stream::iterator::current_index() const noexcept {
// Byte offset of the current document within the original input
// (doc_index is set from batch_start when a batch is loaded).
return stream->doc_index;
}
simdjson_inline std::string_view document_stream::iterator::source() const noexcept {
auto depth = stream->doc.iter.depth();
auto cur_struct_index = stream->doc.iter._root - stream->parser->implementation->structural_indexes.get();