lib/JavaScript/Embedded/C/lib/duktape.c
* of 16-bit code units, and if not, must be conceptually converted to
* that format first. The current lexer processes Unicode code points
* and allows characters outside the BMP. These should be converted to
* surrogate pairs while reading the source characters into the window,
* not after tokens have been formed (as is done now). However, the fix
* is not trivial because two characters are decoded from one codepoint.
*
* * Optimize for speed as well as size. Large if-else ladders are (at
* least potentially) slow.
*/
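The comment above notes that code points outside the BMP should be split into surrogate pairs while filling the character window. A minimal standalone sketch of that conversion (the helper name and layout are illustrative, not duktape's actual code):

```c
#include <assert.h>

/* Split a supplementary code point (U+10000..U+10FFFF) into a UTF-16
 * surrogate pair.  Hypothetical helper sketching the conversion the
 * comment above describes; not part of duktape.c itself.
 */
static void cp_to_surrogates(unsigned long cp, unsigned int *hi, unsigned int *lo) {
    assert(cp >= 0x10000UL && cp <= 0x10ffffUL);
    cp -= 0x10000UL;                                   /* 20-bit offset */
    *hi = (unsigned int) (0xd800UL + (cp >> 10));      /* high surrogate */
    *lo = (unsigned int) (0xdc00UL + (cp & 0x3ffUL));  /* low surrogate */
}
```

Note that this produces two 16-bit code units from one code point, which is exactly why the fix described above is non-trivial when applied after tokens are formed.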
/* #include duk_internal.h -> already included */
/*
* Various defines and file specific helper macros
*/
#define DUK__MAX_RE_DECESC_DIGITS 9
#define DUK__MAX_RE_QUANT_DIGITS 9 /* Does not allow e.g. 2**31-1, but one more would allow overflows of u32. */
/* whether to use macros or helper function depends on call count */
#define DUK__ISDIGIT(x) ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_9)
#define DUK__ISHEXDIGIT(x) duk__is_hex_digit((x))
#define DUK__ISOCTDIGIT(x) ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_7)
#define DUK__ISDIGIT03(x) ((x) >= DUK_ASC_0 && (x) <= DUK_ASC_3)
#define DUK__ISDIGIT47(x) ((x) >= DUK_ASC_4 && (x) <= DUK_ASC_7)
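The DUK__ISDIGIT03/DUK__ISDIGIT47 split exists because a classic three-digit octal escape must start with a digit 0-3 for its value to fit in one byte (at most \377 = 0xff). A hedged standalone sketch of that decoding (the helper is illustrative, not duktape's actual escape-decoding code):

```c
#include <assert.h>

/* Decode a three-digit octal escape \ooo.  The first digit must be 0-3
 * (cf. DUK__ISDIGIT03 above) so the result stays <= 0377 = 0xff; the
 * remaining digits are plain octal digits (cf. DUK__ISOCTDIGIT).
 * Hypothetical helper, for illustration only.
 */
static int octal3_value(int d1, int d2, int d3) {
    assert(d1 >= '0' && d1 <= '3');   /* DUK__ISDIGIT03 range */
    assert(d2 >= '0' && d2 <= '7');   /* DUK__ISOCTDIGIT range */
    assert(d3 >= '0' && d3 <= '7');
    return ((d1 - '0') << 6) | ((d2 - '0') << 3) | (d3 - '0');
}
```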
/* lexer character window helpers */
#define DUK__LOOKUP(lex_ctx, idx) ((lex_ctx)->window[(idx)].codepoint)
#define DUK__ADVANCECHARS(lex_ctx, count) duk__advance_chars((lex_ctx), (count))
#define DUK__ADVANCEBYTES(lex_ctx, count) duk__advance_bytes((lex_ctx), (count))
#define DUK__INITBUFFER(lex_ctx) duk__initbuffer((lex_ctx))
#define DUK__APPENDBUFFER(lex_ctx, x) duk__appendbuffer((lex_ctx), (duk_codepoint_t) (x))
#define DUK__APPENDBUFFER_ASCII(lex_ctx, x) duk__appendbuffer_ascii((lex_ctx), (duk_codepoint_t) (x))
/* lookup shorthands (note: assume context variable is named 'lex_ctx') */
#define DUK__L0() DUK__LOOKUP(lex_ctx, 0)
#define DUK__L1() DUK__LOOKUP(lex_ctx, 1)
#define DUK__L2() DUK__LOOKUP(lex_ctx, 2)
#define DUK__L3() DUK__LOOKUP(lex_ctx, 3)
#define DUK__L4() DUK__LOOKUP(lex_ctx, 4)
#define DUK__L5() DUK__LOOKUP(lex_ctx, 5)
/* packed advance/token number macro used by multiple functions */
#define DUK__ADVTOK(advbytes, tok) ((((advbytes) * sizeof(duk_lexer_codepoint)) << 8) + (tok))
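The DUK__ADVTOK packing above stores the window advance (element count scaled to bytes) in the high bits and the token number in the low 8 bits. A standalone sketch with a mocked window element type (names and the unpacking macros are illustrative assumptions, not duktape's API):

```c
#include <assert.h>
#include <stddef.h>

/* Mock of the lexer's window element; only its size matters here. */
typedef struct { int codepoint; size_t offset; int line; } lexer_cp_t;

/* Pack advance count (in window elements, pre-scaled to bytes) into the
 * high bits and the token number into the low 8 bits, mirroring
 * DUK__ADVTOK.  Assumes token numbers fit in 8 bits.
 */
#define ADVTOK(advtokens, tok) \
    ((((advtokens) * sizeof(lexer_cp_t)) << 8) + (tok))

/* Hypothetical unpacking helpers for illustration. */
#define ADVTOK_BYTES(x) ((x) >> 8)      /* bytes to advance the window */
#define ADVTOK_TOKEN(x) ((x) & 0xffUL)  /* token number */
```

Pre-scaling the advance to bytes means the consumer can advance the window pointer with a single addition, without repeating the sizeof multiplication per token.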
/*
* Advance lookup window by N characters, filling in new characters as
* necessary. After returning caller is guaranteed a character window of
* at least DUK_LEXER_WINDOW_SIZE characters.
*
 * The main function duk__advance_bytes() is called at least once per
 * token, so it has a major lexer/compiler performance impact. There are
 * two variants of the duk__advance_bytes() algorithm: a sliding window
 * approach, which is slightly faster at the cost of a larger code
 * footprint, and a simple copying one.
*
* Decoding directly from the source string would be another lexing option.
* But the lookup window based approach has the advantage of hiding the
* source string and its encoding effectively which gives more flexibility
* going forward to e.g. support chunked streaming of source from flash.
*
* Decodes UTF-8/CESU-8 leniently with support for code points from U+0000 to
* U+10FFFF, causing an error if the input is unparseable. Leniency means:
*
* * Unicode code point validation is intentionally not performed,
* except to check that the codepoint does not exceed 0x10ffff.
*
* * In particular, surrogate pairs are allowed and not combined, which
* allows source files to represent all SourceCharacters with CESU-8.
* Broken surrogate pairs are allowed, as ECMAScript does not mandate
* their validation.
*
* * Allow non-shortest UTF-8 encodings.
*
* Leniency here causes few security concerns because all character data is
* decoded into Unicode codepoints before lexer processing, and is then
* re-encoded into CESU-8. The source can be parsed as strict UTF-8 with
* a compiler option. However, ECMAScript source characters include -all-
* 16-bit unsigned integer codepoints, so leniency seems to be appropriate.
*
* Note that codepoints above the BMP are not strictly SourceCharacters,
* but the lexer still accepts them as such. Before ending up in a string
* or an identifier name, codepoints above BMP are converted into surrogate
* pairs and then CESU-8 encoded, resulting in 16-bit Unicode data as
* expected by ECMAScript.
*
* An alternative approach to dealing with invalid or partial sequences
* would be to skip them and replace them with e.g. the Unicode replacement
* character U+FFFD. This has limited utility because a replacement character
* will most likely cause a parse error, unless it occurs inside a string.
* Further, ECMAScript source is typically pure ASCII.
*
* See:
*
* http://en.wikipedia.org/wiki/UTF-8
* http://en.wikipedia.org/wiki/CESU-8
* http://tools.ietf.org/html/rfc3629
* http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
*
* Future work:
*
* * Reject other invalid Unicode sequences (see Wikipedia entry for examples)
* in strict UTF-8 mode.
*
* * Size optimize. An attempt to use a 16-byte lookup table for the first
* byte resulted in a code increase though.
*
* * Is checking against maximum 0x10ffff really useful? 4-byte encoding
* imposes a certain limit anyway.
*
* * Support chunked streaming of source code. Can be implemented either
* by streaming chunks of bytes or chunks of codepoints.
*/
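The lenient decode described above (non-shortest forms and surrogate code points accepted, code points above 0x10FFFF and malformed continuations rejected) can be sketched standalone as follows. This is illustrative only; duktape's real decoder is the duk__fill_lexer_buffer() logic below:

```c
#include <assert.h>

/* Lenient UTF-8/CESU-8 decode of one code point from [p, end).
 * Returns the code point and stores the sequence length in *len,
 * or returns -1 on error.  Deliberately lenient: no shortest-form
 * check and no surrogate rejection, matching the comment above.
 * Hypothetical sketch, not duktape's actual decoder.
 */
static long lenient_utf8_decode(const unsigned char *p, const unsigned char *end, int *len) {
    unsigned long x;
    int n, i;

    if (p >= end) return -1;
    x = *p++;
    if (x < 0x80UL) { *len = 1; return (long) x; }           /* ASCII fast path */
    else if (x >= 0xc0UL && x < 0xe0UL) { n = 1; x &= 0x1fUL; }
    else if (x >= 0xe0UL && x < 0xf0UL) { n = 2; x &= 0x0fUL; }
    else if (x >= 0xf0UL && x < 0xf8UL) { n = 3; x &= 0x07UL; }
    else return -1;                                          /* invalid initial byte */

    for (i = 0; i < n; i++) {
        if (p >= end || (*p & 0xc0U) != 0x80U) return -1;    /* broken continuation */
        x = (x << 6) + (unsigned long) (*p++ & 0x3fU);
    }
    if (x > 0x10ffffUL) return -1;                           /* above Unicode range */
    *len = n + 1;
    return (long) x;
}
```

Because surrogates pass through undisturbed, a CESU-8 encoded surrogate pair decodes as two separate code points, which is exactly the behavior the comment above relies on.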
#if defined(DUK_USE_LEXER_SLIDING_WINDOW)
DUK_LOCAL void duk__fill_lexer_buffer(duk_lexer_ctx *lex_ctx, duk_small_uint_t start_offset_bytes) {
duk_lexer_codepoint *cp, *cp_end;
duk_ucodepoint_t x;
duk_small_uint_t contlen;
const duk_uint8_t *p, *p_end;
#if defined(DUK_USE_STRICT_UTF8_SOURCE)
duk_ucodepoint_t mincp;
#endif
duk_int_t input_line;
/* Use temporaries and update lex_ctx only when finished. */
input_line = lex_ctx->input_line;
p = lex_ctx->input + lex_ctx->input_offset;
p_end = lex_ctx->input + lex_ctx->input_length;
cp = (duk_lexer_codepoint *) (void *) ((duk_uint8_t *) lex_ctx->buffer + start_offset_bytes);
cp_end = lex_ctx->buffer + DUK_LEXER_BUFFER_SIZE;
for (; cp != cp_end; cp++) {
cp->offset = (duk_size_t) (p - lex_ctx->input);
cp->line = input_line;
/* XXX: potential issue with signed pointers, p_end < p. */
if (DUK_UNLIKELY(p >= p_end)) {
/* If input_offset were assigned a negative value, it would
* result in a large positive value. Most likely it would be
* larger than input_length and be caught here. In any case
* no memory unsafe behavior would happen.
*/
cp->codepoint = -1;
continue;
}
x = (duk_ucodepoint_t) (*p++);
/* Fast path. */
if (DUK_LIKELY(x < 0x80UL)) {
DUK_ASSERT(x != 0x2028UL && x != 0x2029UL); /* not LS/PS */
if (DUK_UNLIKELY(x <= 0x000dUL)) {
if ((x == 0x000aUL) || ((x == 0x000dUL) && (p >= p_end || *p != 0x000aUL))) {
/* lookup for 0x000a above assumes shortest encoding now */
/* E5 Section 7.3, treat the following as newlines:
* LF
* CR [not followed by LF]
* LS
* PS
*
* For CR LF, CR is ignored if it is followed by LF, and the LF will bump
* the line number.
*/
input_line++;
}
}
cp->codepoint = (duk_codepoint_t) x;