JSON-YY
view release on metacpan or search on metacpan
t/14_unicode.t view on Meta::CPAN
is $rt->{emoji}, $emoji, 'emoji roundtrips';
}
# raw (unescaped) 4-byte UTF-8 sequence inside a JSON string
{
    # The document embeds the raw UTF-8 bytes for U+1F600 rather than a
    # \uXXXX escape pair.
    my $raw_doc = "{\"e\":\"\xF0\x9F\x98\x80\"}";
    my $decoded = decode_json($raw_doc);
    ok( defined( $decoded->{e} ), 'direct 4-byte UTF-8 decodes' );
}
# BOM handling
{
    # yyjson may either accept or reject a leading UTF-8 BOM -- both outcomes
    # are fine. What must not happen is a crash, or a failure that is not
    # reported as a decode error.
    my $json_bom = "\xEF\xBB\xBF{\"a\":1}";

    # Record success explicitly and copy $@ straight away: $@ is a global
    # that any later eval may overwrite.
    my $decoded_ok = eval { decode_json($json_bom); 1 };
    my $error      = $@;

    # Reduce the verdict to a plain scalar before handing it to ok().
    # A failed m// evaluated in list context (ok's argument list) returns an
    # empty list, which would shift the description into the test slot and
    # make the assertion pass unconditionally.
    my $acceptable = ( $decoded_ok || $error =~ /decode error/ ) ? 1 : 0;
    ok( $acceptable, 'BOM handling does not crash' )
        or diag("unexpected error: $error");
}
# embedded NUL produced by a \u0000 escape
{
    # Single-quoted on purpose: the JSON text carries the literal escape
    # sequence \u0000, not an actual NUL byte.
    my $doc     = '{"s":"hello\\u0000world"}';
    my $parsed  = decode_json($doc);
    my $got_len = length $parsed->{s};

    # "hello" (5) + NUL (1) + "world" (5)
    is( $got_len, 11, 'null byte in string preserves length' );
}
# various unicode escapes
/* Message strings for JSON reader errors. In the code visible here they are
   passed to return_err() or assigned to err->msg. */

/* Structural errors: an unexpected character inside an array or object. */
#define MSG_ARR_END "unexpected character, expected ',' or ']'"
#define MSG_OBJ_KEY "unexpected character, expected a string key"
#define MSG_OBJ_SEP "unexpected character, expected ':' after key"
#define MSG_OBJ_END "unexpected character, expected ',' or '}'"
/* Errors about where the document ends. */
#define MSG_GARBAGE "unexpected content after document"
#define MSG_NOT_END "unexpected end of data"
/* Non-standard syntax that is rejected. */
#define MSG_COMMENT "unclosed multiline comment"
#define MSG_COMMA "trailing comma is not allowed"
#define MSG_NAN_INF "nan or inf number is not allowed"
#define MSG_ERR_TYPE "invalid JSON value type"
/* Encoding errors. The BOM/UTF-16/UTF-32 messages replace the generic message
   when the failure is at the very start of the input and the matching byte
   order mark is detected there. */
#define MSG_ERR_BOM "UTF-8 byte order mark (BOM) is not supported"
#define MSG_ERR_UTF8 "invalid utf-8 encoding in string"
#define MSG_ERR_UTF16 "UTF-16 encoding is not supported"
#define MSG_ERR_UTF32 "UTF-32 encoding is not supported"
/* U64 constant values */
#undef U64_MAX
#define U64_MAX U64(0xFFFFFFFF, 0xFFFFFFFF)
#undef I64_MAX
#define I64_MAX U64(0x7FFFFFFF, 0xFFFFFFFF)
#undef USIZE_MAX
const u8 *cur = (const u8 *)str;
const u8 *end = cur + pos;
if (!str || pos > len) {
if (line) *line = 0;
if (col) *col = 0;
if (chr) *chr = 0;
return false;
}
if (pos >= 3 && is_utf8_bom(cur)) cur += 3; /* don't count BOM */
while (cur < end) {
u8 c = *cur;
chr_sum += 1;
if (likely(c < 0x80)) { /* 0xxxxxxx (0x00-0x7F) ASCII */
if (c == '\n') {
line_sum += 1;
line_pos = chr_sum;
}
cur += 1;
}
/* unclosed multiline comment */
return true;
}
if (code == YYJSON_READ_ERROR_UNEXPECTED_CHARACTER &&
*cur == '/' && cur + 1 == eof) {
/* truncated beginning of comment */
return true;
}
}
if (code == YYJSON_READ_ERROR_UNEXPECTED_CHARACTER &&
has_allow(BOM)) {
/* truncated UTF-8 BOM */
usize len = (usize)(eof - cur);
if (cur == hdr && len < 3 && !memcmp(hdr, "\xEF\xBB\xBF", len)) {
return true;
}
}
return false;
}
hdr = (u8 *)alc.malloc(alc.ctx, len + YYJSON_PADDING_SIZE);
if (unlikely(!hdr)) {
return_err(0, MEMORY_ALLOCATION, MSG_MALLOC);
}
eof = hdr + len;
cur = hdr;
memcpy(hdr, dat, len);
}
memset(eof, 0, YYJSON_PADDING_SIZE);
if (has_allow(BOM)) {
if (len >= 3 && is_utf8_bom(cur)) cur += 3;
}
/* skip empty contents before json document */
if (unlikely(!char_is_ctn(*cur))) {
while (char_is_space(*cur)) cur++;
if (unlikely(!char_is_ctn(*cur))) {
if (has_allow(TRIVIA) && char_is_trivia(*cur)) {
if (!skip_trivia(&cur, eof, flg) && cur == eof) {
return_err(cur - hdr, INVALID_COMMENT, MSG_COMMENT);
} else {
doc = read_root_single(hdr, cur, eof, alc, flg, err);
}
/* check result */
if (likely(doc)) {
memset(err, 0, sizeof(yyjson_read_err));
} else {
/* RFC 8259: JSON text MUST be encoded using UTF-8 */
if (err->pos == 0 && err->code != YYJSON_READ_ERROR_MEMORY_ALLOCATION) {
if (is_utf8_bom(hdr)) err->msg = MSG_ERR_BOM;
else if (len >= 4 && is_utf32_bom(hdr)) err->msg = MSG_ERR_UTF32;
else if (len >= 2 && is_utf16_bom(hdr)) err->msg = MSG_ERR_UTF16;
}
if (!has_flg(INSITU)) alc.free(alc.ctx, hdr);
}
return doc;
#undef return_err
}
};
yyjson_incr_state *yyjson_incr_new(char *buf, size_t buf_len,
yyjson_read_flag flg,
const yyjson_alc *alc_ptr) {
yyjson_incr_state *state = NULL;
yyjson_alc alc = alc_ptr ? *alc_ptr : YYJSON_DEFAULT_ALC;
/* remove non-standard flags */
flg &= ~YYJSON_READ_JSON5;
flg &= ~YYJSON_READ_ALLOW_BOM;
flg &= ~YYJSON_READ_ALLOW_INVALID_UNICODE;
if (unlikely(!buf)) return NULL;
if (unlikely(buf_len >= USIZE_MAX - YYJSON_PADDING_SIZE)) return NULL;
state = (yyjson_incr_state *)alc.malloc(alc.ctx, sizeof(*state));
if (!state) return NULL;
memset(state, 0, sizeof(yyjson_incr_state));
state->alc = alc;
state->flg = flg;
state->buf_len = buf_len;
goto fail_literal_false;
}
if (*cur == 'n') {
if (likely(read_null(&cur, val))) goto doc_end;
goto fail_literal_null;
}
msg = "unexpected character, expected a valid root value";
if (cur == hdr) {
/* RFC 8259: JSON text MUST be encoded using UTF-8 */
if (is_utf8_bom(hdr)) msg = MSG_ERR_BOM;
else if (len >= 4 && is_utf32_bom(hdr)) msg = MSG_ERR_UTF32;
else if (len >= 2 && is_utf16_bom(hdr)) msg = MSG_ERR_UTF16;
}
return_err(cur, UNEXPECTED_CHARACTER, msg);
arr_begin:
/* save current container */
ctn->tag = (((u64)ctn_len + 1) << YYJSON_TAG_BIT) |
(ctn->tag & YYJSON_TAG_MASK);
/** Run-time options for JSON reader. */
typedef uint32_t yyjson_read_flag;
/** Default option (RFC 8259 compliant):
- Read positive integer as uint64_t.
- Read negative integer as int64_t.
- Read floating-point number as double with round-to-nearest mode.
- Read integer which cannot fit in uint64_t or int64_t as double.
- Report error if double number is infinity.
- Report error if string contains invalid UTF-8 character or BOM.
- Report error on trailing commas, comments, inf and nan literals. */
static const yyjson_read_flag YYJSON_READ_NOFLAG = 0;
/** Read the input data in-situ.
This option allows the reader to modify and use input data to store string
values, which can increase reading speed slightly.
The caller must keep the input data alive until the document is freed.
The input data must be padded by at least `YYJSON_PADDING_SIZE` bytes.
For example: `[1,2]` should be `[1,2]\0\0\0\0`, input length should be 5. */
static const yyjson_read_flag YYJSON_READ_INSITU = 1 << 0;
option is used, you need to handle these strings carefully to avoid security
risks. */
static const yyjson_read_flag YYJSON_READ_ALLOW_INVALID_UNICODE = 1 << 6;
/** Read big numbers as raw strings. These big numbers include integers that
cannot be represented by `int64_t` and `uint64_t`, and floating-point
numbers that cannot be represented by finite `double`.
The flag will be overridden by `YYJSON_READ_NUMBER_AS_RAW` flag. */
static const yyjson_read_flag YYJSON_READ_BIGNUM_AS_RAW = 1 << 7;
/** Allow UTF-8 BOM and skip it before parsing if any (non-standard). */
static const yyjson_read_flag YYJSON_READ_ALLOW_BOM = 1 << 8;
/** Allow extended number formats (non-standard):
- Hexadecimal numbers, such as `0x7B`.
- Numbers with leading or trailing decimal point, such as `.123`, `123.`.
- Numbers with a leading plus sign, such as `+123`. */
static const yyjson_read_flag YYJSON_READ_ALLOW_EXT_NUMBER = 1 << 9;
/** Allow extended escape sequences in strings (non-standard):
- Additional escapes: `\a`, `\e`, `\v`, ``\'``, `\?`, `\0`.
- Hex escapes: `\xNN`, such as `\x7B`.
#if !defined(YYJSON_DISABLE_READER) || !YYJSON_DISABLE_READER
/**
Read JSON with options.
This function is thread-safe when:
1. The `dat` is not modified by other threads.
2. The `alc` is thread-safe or NULL.
@param dat The JSON data (UTF-8 without BOM), null-terminator is not required.
If this parameter is NULL, the function will fail and return NULL.
The `dat` will not be modified without the flag `YYJSON_READ_INSITU`, so you
can pass a `const char *` string and cast it to `char *` if you don't use
the `YYJSON_READ_INSITU` flag.
@param len The length of JSON data in bytes.
If this parameter is 0, the function will fail and return NULL.
@param flg The JSON read options.
Multiple options can be combined with `|` operator. 0 means no options.
@param alc The memory allocator used by JSON reader.
Pass NULL to use the libc's default allocator.
yyjson_api yyjson_doc *yyjson_read_fp(FILE *fp,
yyjson_read_flag flg,
const yyjson_alc *alc,
yyjson_read_err *err);
/**
Read a JSON string.
This function is thread-safe.
@param dat The JSON data (UTF-8 without BOM), null-terminator is not required.
If this parameter is NULL, the function will fail and return NULL.
@param len The length of JSON data in bytes.
If this parameter is 0, the function will fail and return NULL.
@param flg The JSON read options.
Multiple options can be combined with `|` operator. 0 means no options.
@return A new JSON document, or NULL if an error occurs.
When it's no longer needed, it should be freed with `yyjson_doc_free()`.
*/
yyjson_api_inline yyjson_doc *yyjson_read(const char *dat,
size_t len,
if (flg & YYJSON_READ_STOP_WHEN_DONE) len = len < 256 ? 256 : len;
if (len >= (max - pad - mul) / mul) return 0;
return len * mul + pad;
}
/**
Read a JSON number.
This function is thread-safe when data is not modified by other threads.
@param dat The JSON data (UTF-8 without BOM), null-terminator is required.
If this parameter is NULL, the function will fail and return NULL.
@param val The output value where result is stored.
If this parameter is NULL, the function will fail and return NULL.
The value will hold either UINT or SINT or REAL number;
@param flg The JSON read options.
Multiple options can be combined with `|` operator. 0 means no options.
Supports `YYJSON_READ_NUMBER_AS_RAW` and `YYJSON_READ_ALLOW_INF_AND_NAN`.
@param alc The memory allocator used for long number.
It is only used when the built-in floating point reader is disabled.
Pass NULL to use the libc's default allocator.
( run in 1.958 second using v1.01-cache-2.11-cpan-39bf76dae61 )