File-Raw-Separated
view release on metacpan or search on metacpan
include/separated_parser.h view on Meta::CPAN
/*
* separated_parser.h - CSV/TSV parser core for File::Raw::Separated
*
* Plain C99, no Perl dependency. Same parser drives both CSV and TSV;
* defaults differ only in `sep` and `quote`.
*
* Two ways to use:
*
* 1. One-shot: separated_parse(buf, len, opts, row_cb, ud)
* â runs the whole input through the state machine in one call.
*
* 2. Incremental: separated_init() + separated_feed()* + separated_finish()
* â for streaming parsers (mmap chunks, network, etc.).
*
* The callback receives borrowed pointers into the parser's field buffer;
* if the caller needs to keep the field across calls it must copy.
*/
#ifndef SEPARATED_PARSER_H
#define SEPARATED_PARSER_H
#include <stddef.h>
/* ============================================================
* Options
* ============================================================ */
typedef enum {
SEPARATED_EOL_AUTO = 0,
SEPARATED_EOL_LF,
SEPARATED_EOL_CRLF,
SEPARATED_EOL_CR
} separated_eol_t;
typedef struct {
/* Single-byte separator (e.g. ',' or '\t'). */
int sep;
/* Single-byte quote char (e.g. '"'); -1 = no quoting recognised. */
int quote;
/* Single-byte escape char (e.g. '\\'); -1 = doubled-quote escape only. */
int escape;
/* Strict mode: unrecoverable on malformed input.
* Lenient mode (default): best-effort recovery. */
int strict;
/* Line ending mode. AUTO locks to first detected LF/CRLF/CR. */
separated_eol_t eol_mode;
/* Strip leading/trailing ASCII whitespace from unquoted fields only. */
int trim;
/* Empty unquoted field becomes the sentinel SEPARATED_FIELD_NULL
* (length == SEPARATED_FIELD_NULL_LEN, ptr == NULL).
* Quoted empty ("") stays a real zero-length field. */
int empty_is_undef;
/* Skip UTF-8 BOM detection / stripping. */
int binary;
/* Header mode: when set, the dispatcher (XS layer) consumes the
* first emitted row as field names and emits subsequent rows as
* hashrefs keyed by those names. The C parser itself is not
* affected â it always emits fields the same way. */
int header;
/* Maximum field length. 0 = use SEPARATED_FIELD_DEFAULT_CAP. */
size_t max_field_len;
} separated_options_t;
/* Default field cap (16 MiB) when opts.max_field_len == 0. */
#define SEPARATED_FIELD_DEFAULT_CAP (16 * 1024 * 1024)
/* Sentinel returned to callbacks when empty_is_undef + empty unquoted field. */
#define SEPARATED_FIELD_NULL_LEN ((size_t)-1)
/* Initialise an options struct with CSV defaults
* (sep=',', quote='"', strict=0, all other flags 0). */
void separated_options_init_csv(separated_options_t *opts);
/* Initialise an options struct with TSV defaults
* (sep='\t', quote=-1, strict=0, all other flags 0). */
void separated_options_init_tsv(separated_options_t *opts);
/* ============================================================
* Callback signature
* ============================================================
*
* Called once per emitted field (i.e. when the parser knows the field
* is complete because it just saw a separator or row terminator).
*
* field - borrowed pointer into the parser's field buffer
* (NULL when len == SEPARATED_FIELD_NULL_LEN, see opts.empty_is_undef)
* len - length in bytes, or SEPARATED_FIELD_NULL_LEN for "undef"
* end_of_row - 1 if this is the last field of a row, 0 otherwise
* ud - user-data pointer passed to separated_parse / _init
*
* Return 0 to continue, non-zero to abort the parse early.
*/
typedef int (*separated_field_cb)(const char *field, size_t len,
int end_of_row, void *ud);
/* ============================================================
* Errors
* ============================================================ */
typedef enum {
SEPARATED_OK = 0,
SEPARATED_ERR_NOMEM = -1, /* allocation failed */
SEPARATED_ERR_FIELD_TOO_LONG = -2, /* exceeds max_field_len */
SEPARATED_ERR_BAD_QUOTE = -3, /* strict: stray/unbalanced quote */
SEPARATED_ERR_EOL_PINNED = -4, /* strict: eol mode set but mismatch */
SEPARATED_ERR_ABORTED = -5 /* callback returned non-zero */
} separated_err_t;
/* Get human-readable description for an error code. */
const char *separated_strerror(separated_err_t err);
/* ============================================================
* One-shot parse
* ============================================================
*
* Parses the entire buffer in one call.
*
* Returns:
* >= 0 number of rows emitted
* < 0 negative separated_err_t value
*
* On error, *err_offset (if non-NULL) is set to the byte offset within
* `buf` where the error was detected. On success it is set to `len`.
*/
long separated_parse(const char *buf, size_t len,
const separated_options_t *opts,
separated_field_cb cb, void *ud,
size_t *err_offset);
/* ============================================================
* Incremental parse
* ============================================================
*
* For chunked / streaming parsers. Same parser core, just keeps state
* between calls. Use:
*
* separated_ctx_t *ctx = separated_init(opts, cb, ud);
* while (chunk = next chunk) {
* rc = separated_feed(ctx, chunk, chunk_len);
* if (rc < 0) { handle error; break; }
* }
* rc = separated_finish(ctx); // flushes a trailing field/row
* separated_free(ctx);
*
* separated_init copies the options struct internally; the caller may
* free / reuse `opts` after the call.
*/
typedef struct separated_ctx separated_ctx_t;
separated_ctx_t *separated_init(const separated_options_t *opts,
separated_field_cb cb, void *ud);
separated_err_t separated_feed(separated_ctx_t *ctx,
const char *buf, size_t len);
separated_err_t separated_finish(separated_ctx_t *ctx);
void separated_free(separated_ctx_t *ctx);
/* Total bytes seen so far, for error reporting. */
size_t separated_offset(const separated_ctx_t *ctx);
/* Total rows emitted so far. */
size_t separated_rows(const separated_ctx_t *ctx);
#endif /* SEPARATED_PARSER_H */
( run in 0.458 second using v1.01-cache-2.11-cpan-140bd7fdf52 )