Text-ClearSilver
view release on metacpan or search on metacpan
cs/util/regex/regex.c view on Meta::CPAN
Copyright (C) 1993, 1994, 1995, 1996 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
USA. */
/* AIX requires this to be the first thing in the file. */
#if defined (_AIX) && !defined (REGEX_MALLOC)
#pragma alloca
#endif
#undef _GNU_SOURCE
#define _GNU_SOURCE
#include "cs_config.h"
#include "util/osdep.h"
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif
/* We need this for `regex.h', and perhaps for the Emacs include files. */
#include <sys/types.h>
/* This is for other GNU distributions with internationalized messages. */
#if HAVE_LIBINTL_H || defined (_LIBC)
# include <libintl.h>
#else
# define gettext(msgid) (msgid)
#endif
#ifndef gettext_noop
/* This define is so xgettext can find the internationalizable
strings. */
#define gettext_noop(String) String
#endif
/* The `emacs' switch turns on certain matching commands
that make sense only in Emacs. */
#ifdef emacs
#include "lisp.h"
#include "buffer.h"
#include "syntax.h"
#else /* not emacs */
/* If we are not linking with Emacs proper,
we can't use the relocating allocator
even if config.h says that we can. */
#undef REL_ALLOC
#if defined (STDC_HEADERS) || defined (_LIBC)
#include <stdlib.h>
#else
char *malloc ();
char *realloc ();
#endif
/* When used in Emacs's lib-src, we need to get bzero and bcopy somehow.
If nothing else has been done, use the method below. */
#ifdef INHIBIT_STRING_HEADER
#if !(defined (HAVE_BZERO) && defined (HAVE_BCOPY))
#if !defined (bzero) && !defined (bcopy)
#undef INHIBIT_STRING_HEADER
#endif
#endif
#endif
/* This is the normal way of making sure we have a bcopy and a bzero.
This is used in most programs--a few other programs avoid this
by defining INHIBIT_STRING_HEADER. */
#ifndef INHIBIT_STRING_HEADER
#if defined (HAVE_STRING_H) || defined (STDC_HEADERS) || defined (_LIBC)
#include <string.h>
#ifndef bcmp
#define bcmp(s1, s2, n) memcmp ((s1), (s2), (n))
#endif
#ifndef bcopy
#define bcopy(s, d, n) memcpy ((d), (s), (n))
#endif
#ifndef bzero
#define bzero(s, n) memset ((s), 0, (n))
#endif
#else
#include <strings.h>
#endif
#endif
/* Define the syntax stuff for \<, \>, etc. */
/* This must be nonzero for the wordchar and notwordchar pattern
commands in re_match_2. */
#ifndef Sword
#define Sword 1
#endif
#ifdef SWITCH_ENUM_BUG
#define SWITCH_ENUM_CAST(x) ((int)(x))
#else
#define SWITCH_ENUM_CAST(x) (x)
#endif
#ifdef SYNTAX_TABLE
extern char *re_syntax_table;
#else /* not SYNTAX_TABLE */
cs/util/regex/regex.c view on Meta::CPAN
re_search* or re_match* could cause memory leaks when C-g is used in
Emacs; also, malloc is slower and causes storage fragmentation. On
the other hand, malloc is more portable, and easier to debug.
Because we sometimes use alloca, some routines have to be macros,
not functions -- `alloca'-allocated space disappears at the end of the
function it is called in. */
#ifdef REGEX_MALLOC
#define REGEX_ALLOCATE malloc
#define REGEX_REALLOCATE(source, osize, nsize) realloc (source, nsize)
#define REGEX_FREE free
#else /* not REGEX_MALLOC */
/* Emacs already defines alloca, sometimes. */
#ifndef alloca
/* Make alloca work the best possible way. */
#ifdef __GNUC__
#define alloca __builtin_alloca
#else /* not __GNUC__ */
#if HAVE_ALLOCA_H
#include <alloca.h>
#else /* not __GNUC__ or HAVE_ALLOCA_H */
#if 0 /* It is a bad idea to declare alloca. We always cast the result. */
#ifndef _AIX /* Already did AIX, up at the top. */
char *alloca ();
#endif /* not _AIX */
#endif
#endif /* not HAVE_ALLOCA_H */
#endif /* not __GNUC__ */
#endif /* not alloca */
#define REGEX_ALLOCATE alloca
/* Assumes a `char *destination' variable. */
#define REGEX_REALLOCATE(source, osize, nsize) \
(destination = (char *) alloca (nsize), \
bcopy (source, destination, osize), \
destination)
/* No need to do anything to free, after alloca. */
#define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
#endif /* not REGEX_MALLOC */
/* Define how to allocate the failure stack. */
#if defined (REL_ALLOC) && defined (REGEX_MALLOC)
#define REGEX_ALLOCATE_STACK(size) \
r_alloc (&failure_stack_ptr, (size))
#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
r_re_alloc (&failure_stack_ptr, (nsize))
#define REGEX_FREE_STACK(ptr) \
r_alloc_free (&failure_stack_ptr)
#else /* not using relocating allocator */
#ifdef REGEX_MALLOC
#define REGEX_ALLOCATE_STACK malloc
#define REGEX_REALLOCATE_STACK(source, osize, nsize) realloc (source, nsize)
#define REGEX_FREE_STACK free
#else /* not REGEX_MALLOC */
#define REGEX_ALLOCATE_STACK alloca
#define REGEX_REALLOCATE_STACK(source, osize, nsize) \
REGEX_REALLOCATE (source, osize, nsize)
/* No need to explicitly free anything. */
#define REGEX_FREE_STACK(arg)
#endif /* not REGEX_MALLOC */
#endif /* not using relocating allocator */
/* True if `size1' is non-NULL and PTR is pointing anywhere inside
`string1' or just past its end. This works if PTR is NULL, which is
a good thing. */
#define FIRST_STRING_P(ptr) \
(size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
/* (Re)Allocate N items of type T using malloc, or fail. */
#define TALLOC(n, t) ((t *) malloc ((n) * sizeof (t)))
#define RETALLOC(addr, n, t) ((addr) = (t *) realloc (addr, (n) * sizeof (t)))
#define RETALLOC_IF(addr, n, t) \
if (addr) RETALLOC((addr), (n), t); else (addr) = TALLOC ((n), t)
#define REGEX_TALLOC(n, t) ((t *) REGEX_ALLOCATE ((n) * sizeof (t)))
#define BYTEWIDTH 8 /* In bits. */
#define STREQ(s1, s2) ((strcmp (s1, s2) == 0))
#undef MAX
#undef MIN
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
typedef char boolean;
#define false 0
#define true 1
static int re_match_2_internal ();
/* These are the command codes that appear in compiled regular
expressions. Some opcodes are followed by argument bytes. A
command code can specify any interpretation whatsoever for its
arguments. Zero bytes may appear in the compiled regular expression. */
typedef enum
{
no_op = 0,
/* Succeed right away--no more backtracking. */
succeed,
/* Followed by one byte giving n, then by n literal bytes. */
exactn,
/* Matches any (more or less) character. */
anychar,
/* Matches any one char belonging to specified set. First
following byte is number of bitmap bytes. Then come bytes
for a bitmap saying which chars are in. Bits in each byte
are ordered low-bit-first. A character is in the set if its
bit is 1. A character too large to have a bit in the map is
automatically not in the set. */
charset,
/* Same parameters as charset, but match any character that is
not one of those specified. */
charset_not,
cs/util/regex/regex.c view on Meta::CPAN
#define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
#endif /* not DEBUG */
/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
also be assigned to arbitrarily: each pattern buffer stores its own
syntax, so it can be changed between regex compilations. */
/* This has no initializer because initialized variables in Emacs
become read-only after dumping. */
reg_syntax_t re_syntax_options;
/* Specify the precise syntax of regexps for compilation. This provides
for compatibility for various utilities which historically have
different, incompatible syntaxes.
The argument SYNTAX is a bit mask comprised of the various bits
defined in regex.h. We return the old syntax. */
reg_syntax_t
re_set_syntax (syntax)
reg_syntax_t syntax;
{
reg_syntax_t ret = re_syntax_options;
re_syntax_options = syntax;
return ret;
}
/* This table gives an error message for each of the error codes listed
in regex.h. Obviously the order here has to be same as there.
POSIX doesn't require that we do anything for REG_NOERROR,
but why not be nice? */
static const char *re_error_msgid[] =
{
gettext_noop ("Success"), /* REG_NOERROR */
gettext_noop ("No match"), /* REG_NOMATCH */
gettext_noop ("Invalid regular expression"), /* REG_BADPAT */
gettext_noop ("Invalid collation character"), /* REG_ECOLLATE */
gettext_noop ("Invalid character class name"), /* REG_ECTYPE */
gettext_noop ("Trailing backslash"), /* REG_EESCAPE */
gettext_noop ("Invalid back reference"), /* REG_ESUBREG */
gettext_noop ("Unmatched [ or [^"), /* REG_EBRACK */
gettext_noop ("Unmatched ( or \\("), /* REG_EPAREN */
gettext_noop ("Unmatched \\{"), /* REG_EBRACE */
gettext_noop ("Invalid content of \\{\\}"), /* REG_BADBR */
gettext_noop ("Invalid range end"), /* REG_ERANGE */
gettext_noop ("Memory exhausted"), /* REG_ESPACE */
gettext_noop ("Invalid preceding regular expression"), /* REG_BADRPT */
gettext_noop ("Premature end of regular expression"), /* REG_EEND */
gettext_noop ("Regular expression too big"), /* REG_ESIZE */
gettext_noop ("Unmatched ) or \\)"), /* REG_ERPAREN */
};
/* Avoiding alloca during matching, to placate r_alloc. */
/* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
searching and matching functions should not call alloca. On some
systems, alloca is implemented in terms of malloc, and if we're
using the relocating allocator routines, then malloc could cause a
relocation, which might (if the strings being searched are in the
ralloc heap) shift the data out from underneath the regexp
routines.
Here's another reason to avoid allocation: Emacs
processes input from X in a signal handler; processing X input may
call malloc; if input arrives while a matching routine is calling
malloc, then we're scrod. But Emacs can't just block input while
calling matching routines; then we don't notice interrupts when
they come in. So, Emacs blocks input around all regexp calls
except the matching calls, which it leaves unprotected, in the
faith that they will not malloc. */
/* Normally, this is fine. */
#define MATCH_MAY_ALLOCATE
/* When using GNU C, we are not REALLY using the C alloca, no matter
what config.h may say. So don't take precautions for it. */
#ifdef __GNUC__
#undef C_ALLOCA
#endif
/* The match routines may not allocate if (1) they would do it with malloc
and (2) it's not safe for them to use malloc.
Note that if REL_ALLOC is defined, matching would not use malloc for the
failure stack, but we would still use it for the register vectors;
so REL_ALLOC should not affect this. */
#if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
#undef MATCH_MAY_ALLOCATE
#endif
/* Failure stack declarations and macros; both re_compile_fastmap and
re_match_2 use a failure stack. These have to be macros because of
REGEX_ALLOCATE_STACK. */
/* Number of failure points for which to initially allocate space
when matching. If this number is exceeded, we allocate more
space, so it is not a hard limit. */
#ifndef INIT_FAILURE_ALLOC
#define INIT_FAILURE_ALLOC 5
#endif
/* Roughly the maximum number of failure points on the stack. Would be
exactly that if always used MAX_FAILURE_ITEMS items each time we failed.
This is a variable only so users of regex can assign to it; we never
change it ourselves. */
#if defined (MATCH_MAY_ALLOCATE)
/* 4400 was enough to cause a crash on Alpha OSF/1,
whose default stack limit is 2mb. */
int re_max_failures = 20000;
#else
int re_max_failures = 2000;
#endif
union fail_stack_elt
{
unsigned char *pointer;
int integer;
};
cs/util/regex/regex.c view on Meta::CPAN
We also want to fetch the endpoints without translating them; the
appropriate translation is done in the bit-setting loop below. */
/* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
range_start = ((const unsigned char *) p)[-2];
range_end = ((const unsigned char *) p)[0];
/* Have to increment the pointer into the pattern string, so the
caller isn't still at the ending character. */
(*p_ptr)++;
/* If the start is after the end, the range is empty. */
if (range_start > range_end)
return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
/* Here we see why `this_char' has to be larger than an `unsigned
char' -- the range is inclusive, so if `range_end' == 0xff
(assuming 8-bit characters), we would otherwise go into an infinite
loop, since all characters <= 0xff. */
for (this_char = range_start; this_char <= range_end; this_char++)
{
SET_LIST_BIT (TRANSLATE (this_char));
}
return REG_NOERROR;
}
/* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
characters can start a string that matches the pattern. This fastmap
is used by re_search to skip quickly over impossible starting points.
The caller must supply the address of a (1 << BYTEWIDTH)-byte data
area as BUFP->fastmap.
We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
the pattern buffer.
Returns 0 if we succeed, -2 if an internal error. */
int
re_compile_fastmap (bufp)
struct re_pattern_buffer *bufp;
{
int j, k;
#ifdef MATCH_MAY_ALLOCATE
fail_stack_type fail_stack;
#endif
#ifndef REGEX_MALLOC
char *destination;
#endif
/* We don't push any register information onto the failure stack. */
unsigned num_regs = 0;
register char *fastmap = bufp->fastmap;
unsigned char *pattern = bufp->buffer;
unsigned long size = bufp->used;
unsigned char *p = pattern;
register unsigned char *pend = pattern + size;
/* This holds the pointer to the failure stack, when
it is allocated relocatably. */
#ifdef REL_ALLOC
fail_stack_elt_t *failure_stack_ptr;
#endif
/* Assume that each path through the pattern can be null until
proven otherwise. We set this false at the bottom of switch
statement, to which we get only if a particular path doesn't
match the empty string. */
boolean path_can_be_null = true;
/* We aren't doing a `succeed_n' to begin with. */
boolean succeed_n_p = false;
assert (fastmap != NULL && p != NULL);
INIT_FAIL_STACK ();
bzero (fastmap, 1 << BYTEWIDTH); /* Assume nothing's valid. */
bufp->fastmap_accurate = 1; /* It will be when we're done. */
bufp->can_be_null = 0;
while (1)
{
if (p == pend || *p == succeed)
{
/* We have reached the (effective) end of pattern. */
if (!FAIL_STACK_EMPTY ())
{
bufp->can_be_null |= path_can_be_null;
/* Reset for next path. */
path_can_be_null = true;
p = fail_stack.stack[--fail_stack.avail].pointer;
continue;
}
else
break;
}
/* We should never be about to go beyond the end of the pattern. */
assert (p < pend);
switch (SWITCH_ENUM_CAST ((re_opcode_t) *p++))
{
/* I guess the idea here is to simply not bother with a fastmap
if a backreference is used, since it's too hard to figure out
the fastmap for the corresponding group. Setting
`can_be_null' stops `re_search_2' from using the fastmap, so
that is all we do. */
case duplicate:
bufp->can_be_null = 1;
goto done;
/* Following are the cases which match a character. These end
with `break'. */
case exactn:
cs/util/regex/regex.c view on Meta::CPAN
int result = re_match_2_internal (bufp, string1, size1, string2, size2,
pos, regs, stop);
alloca (0);
return result;
}
/* This is a separate function so that we can force an alloca cleanup
afterwards. */
static int
re_match_2_internal (bufp, string1, size1, string2, size2, pos, regs, stop)
struct re_pattern_buffer *bufp;
const char *string1, *string2;
int size1, size2;
int pos;
struct re_registers *regs;
int stop;
{
/* General temporaries. */
int mcnt;
unsigned char *p1;
/* Just past the end of the corresponding string. */
const char *end1, *end2;
/* Pointers into string1 and string2, just past the last characters in
each to consider matching. */
const char *end_match_1, *end_match_2;
/* Where we are in the data, and the end of the current string. */
const char *d, *dend;
/* Where we are in the pattern, and the end of the pattern. */
unsigned char *p = bufp->buffer;
register unsigned char *pend = p + bufp->used;
/* Mark the opcode just after a start_memory, so we can test for an
empty subpattern when we get to the stop_memory. */
unsigned char *just_past_start_mem = 0;
/* We use this to map every character in the string. */
RE_TRANSLATE_TYPE translate = bufp->translate;
/* Failure point stack. Each place that can handle a failure further
down the line pushes a failure point on this stack. It consists of
restart, regend, and reg_info for all registers corresponding to
the subexpressions we're currently inside, plus the number of such
registers, and, finally, two char *'s. The first char * is where
to resume scanning the pattern; the second one is where to resume
scanning the strings. If the latter is zero, the failure point is
a ``dummy''; if a failure happens and the failure point is a dummy,
it gets discarded and the next next one is tried. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
fail_stack_type fail_stack;
#endif
#ifdef DEBUG
static unsigned failure_id = 0;
unsigned nfailure_points_pushed = 0, nfailure_points_popped = 0;
#endif
/* This holds the pointer to the failure stack, when
it is allocated relocatably. */
#ifdef REL_ALLOC
fail_stack_elt_t *failure_stack_ptr;
#endif
/* We fill all the registers internally, independent of what we
return, for use in backreferences. The number here includes
an element for register zero. */
unsigned num_regs = bufp->re_nsub + 1;
/* The currently active registers. */
unsigned lowest_active_reg = NO_LOWEST_ACTIVE_REG;
unsigned highest_active_reg = NO_HIGHEST_ACTIVE_REG;
/* Information on the contents of registers. These are pointers into
the input strings; they record just what was matched (on this
attempt) by a subexpression part of the pattern, that is, the
regnum-th regstart pointer points to where in the pattern we began
matching and the regnum-th regend points to right after where we
stopped matching the regnum-th subexpression. (The zeroth register
keeps track of what the whole pattern matches.) */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
const char **regstart, **regend;
#endif
/* If a group that's operated upon by a repetition operator fails to
match anything, then the register for its start will need to be
restored because it will have been set to wherever in the string we
are when we last see its open-group operator. Similarly for a
register's end. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
const char **old_regstart, **old_regend;
#endif
/* The is_active field of reg_info helps us keep track of which (possibly
nested) subexpressions we are currently in. The matched_something
field of reg_info[reg_num] helps us tell whether or not we have
matched any of the pattern so far this time through the reg_num-th
subexpression. These two fields get reset each time through any
loop their register is in. */
#ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
register_info_type *reg_info;
#endif
/* The following record the register info as found in the above
variables when we find a match better than any we've seen before.
This happens as we backtrack through the failure points, which in
turn happens only if we have not yet matched the entire string. */
unsigned best_regs_set = false;
#ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
const char **best_regstart, **best_regend;
#endif
/* Logically, this is `best_regend[0]'. But we don't want to have to
allocate space for that if we're not allocating space for anything
else (see below). Also, we never need info about register 0 for
any of the other register vectors, and it seems rather a kludge to
treat `best_regend' differently than the rest. So we keep track of
the end of the best match so far in a separate variable. We
initialize this to NULL so that when we backtrack the first time
and need to test it, it's not garbage. */
( run in 0.430 second using v1.01-cache-2.11-cpan-5511b514fd6 )