unicode results from the CPAN

unicode

Cpanel-JSON-XS

view release on metacpan or search on metacpan

{
    return json->magic == JSON_MAGIC;
}

/* Unpacks the 2 boolean objects from the global references */
INLINE SV *
get_bool (pTHX_ const char *name)
{
  dMY_CXT;
#if PERL_VERSION > 7
  SV *sv = get_sv (name, 1);
#else
  SV *sv = GvSV(gv_fetchpv(name, 1, SVt_PV));
#endif
  SV* rv = SvRV(sv);
  if (!SvOBJECT(sv) || !SvSTASH(sv)) {
    SvREADONLY_off (sv);
    SvREADONLY_off (rv);
    (void)sv_bless(sv, MY_CXT.json_boolean_stash); /* bless the ref */
  }
  SvREADONLY_on (rv);
  SvREADONLY_on (sv);
  return sv;
}

INLINE void
shrink (pTHX_ SV *sv)
{
  /* ignore errors */
  (void)sv_utf8_downgrade (sv, 1);

  if (SvLEN (sv) > SvCUR (sv) + 1)
    {
#ifdef SvPV_shrink_to_cur
      SvPV_shrink_to_cur (sv);
#elif defined (SvPV_renew)
      SvPV_renew (sv, SvCUR (sv) + 1);
#endif
    }
}

/* Decode an utf-8 character and return it, or (UV)-1 in
   case of an error.
   We special-case "safe" characters from U+80 .. U+7FF,
   but use the very good perl function until 5.36 to parse anything else.
   note that we never call this function for an ascii codepoints.
   With 5.36 perl5 removed the API to decode utf8 again with flags for relaxed,
   so we have to hack around this regression again.
*/
INLINE UV
decode_utf8 (pTHX_ unsigned char *s, STRLEN len, int relaxed, STRLEN *clen)
{
  if (LIKELY(len >= 2
             && IN_RANGE_INC (char, s[0], 0xc2, 0xdf)
             && IN_RANGE_INC (char, s[1], 0x80, 0xbf)))
    {
      *clen = 2;
      return ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
    }
  else {
/* Since perl 5.14 we can disallow surrogates and illegal unicode above
   U+10FFFF.
   Before we could only warn with warnings 'utf8'.
   Surrogates are never allowed for consistency with unpaired escaped surrogate
   handling.
   SUPER, above U+10FFFF is not allowed, unless we are in the relaxed mode.
*/
#if PERL_VERSION > 36
    UV c = utf8n_to_uvchr (s, len, clen,
                          UTF8_CHECK_ONLY | UTF8_DISALLOW_SURROGATE | (relaxed ? 0 : UTF8_DISALLOW_SUPER));
#elif PERL_VERSION > 12
    UV c = utf8n_to_uvuni (s, len, clen,
                           UTF8_CHECK_ONLY | UTF8_DISALLOW_SURROGATE | (relaxed ? 0 : UTF8_DISALLOW_SUPER));
#elif PERL_VERSION >= 8
    UV c = utf8n_to_uvuni (s, len, clen, UTF8_CHECK_ONLY);
#endif
#if PERL_VERSION >= 8 && PERL_VERSION <= 12
    if (c > PERL_UNICODE_MAX && !relaxed)
      *clen = -1;
#endif
#if PERL_VERSION >= 8
    return c;
#else
    /* 5.6 does not detect certain ill-formed sequences, esp. overflows,
       which are security relevant. so we add code to detect these. */
    UV c = utf8_to_uv(s, len, clen, UTF8_CHECK_ONLY);
    if (!relaxed) {
      if (!c || c > PERL_UNICODE_MAX)
        *clen = -1;
      /* need to check manually for some overflows. 5.6 unicode bug */
      else if (len >= 2
               && IN_RANGE_INC (char, s[0], 0xc0, 0xfe)
               && !IN_RANGE_INC (char, s[0], 0xc2, 0xdf)) {
        U8 *s0, *send;
        UV uv = *s;
        UV expectlen = UTF8SKIP(s);

#define UTF_CONTINUATION_MASK           ((U8) ((1U << 6) - 1))
#define UTF_ACCUMULATION_OVERFLOW_MASK                          \
        (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * 8) - 6))

        s0 = s;
        /*printf ("maybe overlong <%.*s> %d/%d %x %x\n", len, s, c,
                  *clen, s[0], s[1]);*/
        if (*clen > 4) {
          *clen = -1;
          return c;
        }
        send = (U8*) s0 + ((expectlen <= len) ? len : len);
        for (s = s0 + 1; s < send; s++) {
          if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
            if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
              /*printf ("overflow\n");*/
              *clen = -1;
              return c;
	    }
	    uv = UTF8_ACCUMULATE(uv, *s);
          }
	  else {
            /*printf ("unexpected non continuation\n");*/
            *clen = -1;
            return c;
          }
	}
      }
    }
    return c;
#endif
  }
}

/* Likewise for encoding, also never called for ascii codepoints. */
/* This function takes advantage of this fact, although current gcc's */
/* seem to optimise the check for >= 0x80 away anyways. */
INLINE unsigned char *
encode_utf8 (unsigned char *s, UV ch)
{
  UV uv_ch;
  if    (UNLIKELY(ch < 0x000080))
    *s++ = (unsigned char) ch;
  else if (LIKELY(ch < 0x000800)) {
    uv_ch = 0xc0 | ( ch >>  6);
    *s++ = (unsigned char) uv_ch;
    uv_ch = 0x80 | ( ch        & 0x3f);
    *s++ = (unsigned char) uv_ch;
  }
  else if (ch < 0x010000) {
    uv_ch = 0xe0 | ( ch >> 12);
    *s++ = (unsigned char) uv_ch;
    uv_ch = 0x80 | ((ch >>  6) & 0x3f);

XS.xs view on Meta::CPAN


  if (!SvOBJECT (scalar) && ref_bool_type (aTHX_ scalar) >= 0)
    return 1;

  if (SvOBJECT (scalar) && is_bool_obj (aTHX_ scalar))
    return 1;
  
  return 0;
}

/*/////////////////////////////////////////////////////////////////////////// */
/* encoder */

/* structure used for encoding JSON */
typedef struct
{
  char *cur;  /* SvPVX (sv) + current output position */
  char *end;  /* SvEND (sv) */
  SV *sv;     /* result scalar */
  JSON json;
  JSON *orig_json; /* pointer to original JSON object (for recursion guard) */
  U32 indent; /* indentation level */
  UV limit;   /* escape character values >= this value when encoding */
} enc_t;

INLINE void
need (pTHX_ enc_t *enc, STRLEN len)
{
#if PERL_VERSION > 8 || (PERL_VERSION == 8 && PERL_SUBVERSION >= 1)
  DEBUG_v(Perl_deb(aTHX_ "need enc: %p %p %4ld, want: %lu\n", enc->cur, enc->end,
                   (long)(enc->end - enc->cur), (unsigned long)len));
#endif
  assert(enc->cur <= enc->end);
  if (UNLIKELY(enc->cur + len >= enc->end))
    {
      STRLEN cur = enc->cur - (char *)SvPVX (enc->sv);
      SvGROW (enc->sv, cur + (len < (cur >> 2) ? cur >> 2 : len) + 1);
      enc->cur = SvPVX (enc->sv) + cur;
      enc->end = SvPVX (enc->sv) + SvLEN (enc->sv) - 1;
    }
}

INLINE void
encode_ch (pTHX_ enc_t *enc, char ch)
{
  need (aTHX_ enc, 1);
  *enc->cur++ = ch;
}

static void
encode_str (pTHX_ enc_t *enc, char *str, STRLEN len, int is_utf8)
{
  char *end = str + len;

#if PERL_VERSION < 8
  /* perl5.6 encodes to utf8 automatically, reverse it */
  if (is_utf8 && (enc->json.flags & F_BINARY))
    {
      str = (char *)utf8_to_bytes((U8*)str, &len);
      if (!str)
	croak ("illegal unicode character in binary string", str);
      end = str + len;
    }
#endif
  need (aTHX_ enc, len);

  while (str < end)
    {
      unsigned char ch = *(unsigned char *)str;
#if PERL_VERSION > 8 || (PERL_VERSION == 8 && PERL_SUBVERSION >= 1)
      DEBUG_v(Perl_deb(aTHX_ "str  enc: %p %p %4ld, want: %lu\n", enc->cur, enc->end,
                       (long)(enc->end - enc->cur), (long unsigned)len));
#endif
      if (LIKELY(ch >= 0x20 && ch < 0x80)) /* most common case */
        {
          assert(enc->cur <= enc->end);
          if (UNLIKELY(ch == '"')) /* but with slow exceptions */
            {
              need (aTHX_ enc, 2);
              *enc->cur++ = '\\';
              *enc->cur++ = '"';
              ++len;
            }
          else if (UNLIKELY(ch == '\\'))
            {
              need (aTHX_ enc, 2);
              *enc->cur++ = '\\';
              *enc->cur++ = '\\';
              ++len;
            }
          else if (UNLIKELY(ch == '/' && (enc->json.flags & F_ESCAPE_SLASH)))
            {
              need (aTHX_ enc, 2);
              *enc->cur++ = '\\';
              *enc->cur++ = '/';
              ++len;
            }
          else {
            need (aTHX_ enc, 1);
            *enc->cur++ = ch;
          }

          ++str;
        }
      else
        {
          assert(enc->cur <= enc->end);
          switch (ch)
            {
            case '\010': need (aTHX_ enc, 2);
              *enc->cur++ = '\\'; *enc->cur++ = 'b'; ++len; ++str; break;
            case '\011': need (aTHX_ enc, 2);
              *enc->cur++ = '\\'; *enc->cur++ = 't'; ++len; ++str; break;
            case '\012': need (aTHX_ enc, 2);
              *enc->cur++ = '\\'; *enc->cur++ = 'n'; ++len; ++str; break;
            case '\014': need (aTHX_ enc, 2);
              *enc->cur++ = '\\'; *enc->cur++ = 'f'; ++len; ++str; break;
            case '\015': need (aTHX_ enc, 2);
              *enc->cur++ = '\\'; *enc->cur++ = 'r'; ++len; ++str; break;

            default:
              {
                  STRLEN clen;
                  UV uch;

                  if (is_utf8 && !(enc->json.flags & F_BINARY))
                    {
                      uch = decode_utf8 (aTHX_ (unsigned char *)str, end - str,
                                         enc->json.flags & F_RELAXED, &clen);
                      if (clen == (STRLEN)-1)
                        croak ("malformed or illegal unicode character in string [%.11s], cannot convert to JSON", str);
                    }
                  else
                    {
                      uch = ch;
                      clen = 1;
                    }

                  if (uch < 0x80/*0x20*/ || uch >= enc->limit)
                    {
		      if (enc->json.flags & F_BINARY)
			{
                          /* MB cannot arrive here */
                          need (aTHX_ enc, 4);
                          *enc->cur++ = '\\';
                          *enc->cur++ = 'x';
                          *enc->cur++ = PL_hexdigit [(uch >>  4) & 15];
                          *enc->cur++ = PL_hexdigit [ uch & 15];
                          len += 3;
			}
                      else if (uch >= 0x10000UL)
                        {
                          if (uch >= 0x110000UL)
                            croak ("out of range codepoint (0x%lx) encountered, unrepresentable in JSON", (unsigned long)uch);

                          need (aTHX_ enc, 12);
                          sprintf (enc->cur, "\\u%04x\\u%04x",
                                   (int)((uch - 0x10000) / 0x400 + 0xD800),
                                   (int)((uch - 0x10000) % 0x400 + 0xDC00));
                          enc->cur += 12;
                          len += 11;
                        }
		      else
                        {
                          need (aTHX_ enc, 6);
                          *enc->cur++ = '\\';
                          *enc->cur++ = 'u';
                          *enc->cur++ = PL_hexdigit [ uch >> 12      ];
                          *enc->cur++ = PL_hexdigit [(uch >>  8) & 15];
                          *enc->cur++ = PL_hexdigit [(uch >>  4) & 15];
                          *enc->cur++ = PL_hexdigit [ uch & 15];
                          len += 5;
                        }

                      str += clen;
                    }
                  else if (enc->json.flags & F_LATIN1)
                    {
                      need (aTHX_ enc, 1);
                      *enc->cur++ = (unsigned char)uch;
                      str += clen;
                    }
                  else if (enc->json.flags & F_BINARY)
                    {
                      need (aTHX_ enc, 1);
                      *enc->cur++ = (unsigned char)uch;
                      str += clen;
                    }
                  else if (is_utf8)
                    {
                      need (aTHX_ enc, clen);

XS.xs view on Meta::CPAN

              switch (*dec_cur)
                {
                  case '\\':
                  case '/':
                  case '"': *cur++ = *dec_cur++; break;

                  case 'b': ++dec_cur; *cur++ = '\010'; break;
                  case 't': ++dec_cur; *cur++ = '\011'; break;
                  case 'n': ++dec_cur; *cur++ = '\012'; break;
                  case 'f': ++dec_cur; *cur++ = '\014'; break;
                  case 'r': ++dec_cur; *cur++ = '\015'; break;
                  case '\'':
                    {
                      if( dec->json.flags & F_ALLOW_SQUOTE ) {
                        *cur++ = *dec_cur++;
                      } else {
                        --dec_cur;
                        ERR ("illegal backslash escape sequence in string");
                      }
                      break;
                    }
                  case 'x':
		    {
		      unsigned char c;
		      if (!(dec->json.flags & F_BINARY))
                        ERR ("illegal hex character in non-binary string");
		      ++dec_cur;
                      dec->cur = dec_cur;
                      c = (unsigned char)decode_2hex (dec);
                      if (c == (unsigned char)((UV)-1))
                        goto fail;
		      *cur++ = c;
		      dec_cur += 2;
		      break;
		    }
                  case '0': case '1': case '2': case '3':
		  case '4': case '5': case '6': case '7':
		    {
		      char c;
		      if (!(dec->json.flags & F_BINARY))
                        ERR ("illegal octal character in non-binary string");
                      dec->cur = dec_cur;
                      c = (char)decode_3oct (dec);
                      if (c == (char)-1)
                        goto fail;
		      *cur++ = c;
		      dec_cur += 3;
		      break;
		    }
                  case 'u':
                    {
                      UV lo, hi;
                      ++dec_cur;

                      dec->cur = dec_cur;
                      hi = decode_4hex (dec);
                      dec_cur = dec->cur;
                      if (hi == (UV)-1)
                        goto fail;
		      if (dec->json.flags & F_BINARY)
                        ERR ("illegal unicode character in binary string");

                      /* possibly a surrogate pair */
                      if (hi >= 0xd800) {
                        if (hi < 0xdc00) {
                          if (dec_cur [0] != '\\' || dec_cur [1] != 'u')
                            ERR ("missing low surrogate character in surrogate pair");

                          dec_cur += 2;

                          dec->cur = dec_cur;
                          lo = decode_4hex (dec);
                          dec_cur = dec->cur;
                          if (lo == (UV)-1)
                            goto fail;

                          if (lo < 0xdc00 || lo >= 0xe000)
                            ERR ("surrogate pair expected");

                          hi = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
                          if (UNLIKELY(
                                 !(dec->json.flags & F_RELAXED)
                                 && (((hi & 0xfffe) == 0xfffe)
                                  || ((hi & 0xffff) == 0xffff)))) {
                            WARNER_NONCHAR(hi);
                          }
                        }
                        else if (UNLIKELY(hi < 0xe000)) {
                          ERR ("missing high surrogate character in surrogate pair");
                        }
                        else

/* check 66 noncharacters U+FDD0..U+FDEF, U+FFFE, U+FFFF
   and U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, ... U+10FFFE, U+10FFFF (issue #74)
   and warn as in core.
   See http://www.unicode.org/versions/corrigendum9.html.

   https://www.rfc-editor.org/errata_search.php?rfc=7159&eid=3984
   The WG's consensus was to leave the full range present
   in the ABNF and add the interoperability guidance about
   values outside the Unicode accepted range.
*/
                          if (UNLIKELY(
                                 !(dec->json.flags & F_RELAXED)
                                 && ((hi >= 0xfdd0 && hi <= 0xfdef)
                                  || (hi >= 0xfffe && hi <= 0xffff)))) {
                            WARNER_NONCHAR(hi);
                          }
		      }
                      if (hi >= 0x80)
                        {
                          utf8 = 1;
                          cur = (char*)encode_utf8 ((U8*)cur, hi);
                        }
                      else
                        *cur++ = (unsigned char)hi;
                    }
                    break;

                  default:
                    --dec_cur;
                    ERR ("illegal backslash escape sequence in string");
                }
            }
          else if (LIKELY(ch >= 0x20 && ch < 0x80)) {
            *cur++ = ch;
            /* Ending ' already handled above with (ch == endstr) cid #165321 */
          }
          else if (ch >= 0x80)
            {
              STRLEN clen;

              --dec_cur;

              decode_utf8 (aTHX_ (U8*)dec_cur, dec->end - dec_cur,
                           dec->json.flags & F_RELAXED, &clen);
              if (clen == (STRLEN)-1)
                ERR ("malformed UTF-8 character in JSON string");

              do
                *cur++ = *dec_cur++;
              while (--clen);

              utf8 = 1;
            }
          else if (dec->json.flags & F_RELAXED && ch == '\t') {
            *cur++ = ch;
          } else
            {
              --dec_cur;

              if (!ch)
                ERR ("unexpected end of string while parsing JSON string");
              else
                ERR ("invalid character encountered while parsing JSON string");
            }

( run in 0.996 second using v1.01-cache-2.11-cpan-df04353d9ac )