Coverity fixes
[sxemacs] / src / regex.c
1 /* Extended regular expression matching and search library,
2    version 0.12, extended for XEmacs.
3    (Implements POSIX draft P10003.2/D11.2, except for
4    internationalization features.)
5
6    Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7    Copyright (C) 1995 Sun Microsystems, Inc.
8    Copyright (C) 1995 Ben Wing.
9
10 This file is part of SXEmacs
11
12 SXEmacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
16
17 SXEmacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License
23 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
24
25
26 /* Synched up with: FSF 19.29. */
27
28 /* Changes made for XEmacs:
29
30    (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31        was added.  This causes a huge speedup in font-locking.
32    (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33        being used, because it's too slow -- all those calls to mmap()
34        add humongous overhead.
35    (3) Lots and lots of changes for Mule.  They are bracketed by
36        `#ifdef MULE' or with comments that have `XEmacs' in them.
37  */
38
39 #ifdef HAVE_CONFIG_H
40 #include <config.h>
41 #endif
42
43 #ifndef REGISTER                /* Rigidly enforced as of 20.3 */
44 #define REGISTER
45 #endif
46
47 #ifndef _GNU_SOURCE
48 #define _GNU_SOURCE 1
49 #endif
50
51 #ifdef emacs
52 /* Converts the pointer to the char to BEG-based offset from the start.  */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING                      \
54                           ? (d) - string1 : (d) - (string2 - size1))
55 #else
56 #define PTR_TO_OFFSET(d) 0
57 #endif
58
59 /* We assume non-Mule if emacs isn't defined. */
60 #ifndef emacs
61 #undef MULE
62 #endif
63
64 /* We need this for `regex.h', and perhaps for the Emacs include files.  */
65 #include <sys/types.h>
66
67 /* This is for other GNU distributions with internationalized messages.  */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
69 # include <libintl.h>
70 #else
71 # define gettext(msgid) (msgid)
72 #endif
73
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75    the beginning of a line.  Keep the ifdefs so that it's easier to
76    tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
78
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80    poorly, so we disable it here. */
81
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
83 # undef REL_ALLOC
84 #endif
85
86 /* The `emacs' switch turns on certain matching commands
87    that make sense only in Emacs. */
88 #ifdef emacs
89
90 #include "lisp.h"
91 #include "buffer.h"
92 #include "syntax.h"
93
94 #ifdef MULE
95
96 Lisp_Object Vthe_lisp_rangetab;
97
98 void complex_vars_of_regex(void)
99 {
100         Vthe_lisp_rangetab = Fmake_range_table();
101         staticpro(&Vthe_lisp_rangetab);
102 }
103
104 #else  /* not MULE */
105
106 void complex_vars_of_regex(void)
107 {
108 }
109
110 #endif  /* MULE */
111
112 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
113 #define TRANSLATE_P(tr) (!NILP (tr) && CHAR_TABLEP(tr))
114
115 /* just massage that xfree thing */
116 #undef xfree
117 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
118 # define xfree          zfree
119 #else  /* !BDWGC */
120 # define xfree          yfree
121 #endif  /* BDWGC */
122
123 #else  /* !emacs */
124
125 /* If we are not linking with Emacs proper,
126    we can't use the relocating allocator
127    even if config.h says that we can.  */
128 #undef REL_ALLOC
129
130 #if defined (STDC_HEADERS) || defined (_LIBC)
131 # include <stdlib.h>
132 # include <stdbool.h>
133 #else
134 char *malloc();
135 char *realloc();
136 #endif
137
138 /* Types normally included via lisp.h */
139 #include <stddef.h>             /* for ptrdiff_t */
140
141 #ifdef REGEX_MALLOC
142 #ifndef DECLARE_NOTHING
143 #define DECLARE_NOTHING struct nosuchstruct
144 #endif
145 #endif
146
147 typedef int Emchar;
148
149 #define charptr_emchar(str)             ((Emchar) (str)[0])
150
151 #define INC_CHARPTR(p) ((p)++)
152 #define DEC_CHARPTR(p) ((p)--)
153
154 #include <string.h>
155
156 /* Define the syntax stuff for \<, \>, etc.  */
157
158 /* This must be nonzero for the wordchar and notwordchar pattern
159    commands in re_match_2.  */
160 #ifndef Sword
161 #define Sword 1
162 #endif
163
164 #ifdef SYNTAX_TABLE
165
166 extern char *re_syntax_table;
167
168 #else                           /* not SYNTAX_TABLE */
169
170 /* How many characters in the character set.  */
171 #define CHAR_SET_SIZE 256
172
173 static char re_syntax_table[CHAR_SET_SIZE];
174
175 static void init_syntax_once(void)
176 {
177         static int done = 0;
178
179         if (!done) {
180                 const char *word_syntax_chars =
181                     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
182
183                 memset(re_syntax_table, 0, sizeof(re_syntax_table));
184
185                 while (*word_syntax_chars)
186                         re_syntax_table[(unsigned int)(*word_syntax_chars++)] =
187                             Sword;
188
189                 done = 1;
190         }
191 }
192
193 #endif                          /* SYNTAX_TABLE */
194
195 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
196 #undef SYNTAX_FROM_CACHE
197 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
198
199 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
200 #define TRANSLATE_P(tr) tr
201
202 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
203 # if defined HAVE_GC_GC_H
204 #  include "gc/gc.h"
205 # elif defined HAVE_GC_H
206 #  include "gc.h"
207 # endif
208 # define xmalloc        GC_MALLOC
209 # define xmalloc_atomic GC_MALLOC_ATOMIC
210 # define xrealloc       GC_REALLOC
211 # define xfree          GC_FREE
212 # define xstrdup        GC_STRDUP
213 #else  /* !HAVE_BDWGC */
214 # define xmalloc        malloc
215 # define xmalloc_atomic malloc
216 # define xrealloc       realloc
217 # define xfree          free
218 # define xstrdup        strdup
219 #endif  /* HAVE_BDWGC */
220
221 #endif  /* emacs */
222
223
224 /* Under XEmacs, this is needed because we don't define it elsewhere. */
225 #ifdef SWITCH_ENUM_BUG
226 #define SWITCH_ENUM_CAST(x) ((int)(x))
227 #else
228 #define SWITCH_ENUM_CAST(x) (x)
229 #endif
230 \f
231 /* Get the interface, including the syntax bits.  */
232 #include "regex.h"
233
234 /* isalpha etc. are used for the character classes.  */
235 #include <ctype.h>
236
237 /* Jim Meyering writes:
238
239    "... Some ctype macros are valid only for character codes that
240    isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
241    using /bin/cc or gcc but without giving an ansi option).  So, all
242    ctype uses should be through macros like ISPRINT...  If
243    STDC_HEADERS is defined, then autoconf has verified that the ctype
244    macros don't need to be guarded with references to isascii. ...
245    Defining isascii to 1 should let any compiler worth its salt
246    eliminate the && through constant folding."  */
247
248 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
249 #define ISASCII_1(c) 1
250 #else
251 #define ISASCII_1(c) isascii(c)
252 #endif
253
254 #ifdef MULE
255 /* The IS*() macros can be passed any character, including an extended
256    one.  We need to make sure there are no crashes, which would occur
257    otherwise due to out-of-bounds array references. */
258 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
259 #else
260 #define ISASCII(c) ISASCII_1 (c)
261 #endif                          /* MULE */
262
263 #ifdef isblank
264 #define ISBLANK(c) (ISASCII (c) && isblank (c))
265 #else
266 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
267 #endif
268 #ifdef isgraph
269 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
270 #else
271 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
272 #endif
273
274 /* stolen from FSF/src/charset.h */
275 #define SINGLE_BYTE_CHAR_P(c)   (((unsigned int)(c) & 0xFF) == (c))
276 /* 1 if C is an ASCII character.  */
277 #define IS_REAL_ASCII(c) ((c) < 128)
278 /* 1 if C is a unibyte character.  */
279 #define ISUNIBYTE(c)    (SINGLE_BYTE_CHAR_P(c))
280
281 #define ISPRINT(c) (ISASCII (c) && isprint (c))
282 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
283 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
284 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
285 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
286 #define ISLOWER(c) (ISASCII (c) && islower (c))
287 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
288 #define ISSPACE(c) (ISASCII (c) && isspace (c))
289 #define ISUPPER(c) (ISASCII (c) && isupper (c))
290 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
291
292 #if defined emacs
293 static inline bool
294 ISWORD(int c)
295         __attribute__((always_inline));
296 static inline bool
297 ISWORD(int c)
298 {
299         Lisp_Object ct = regex_emacs_buffer->mirror_syntax_table;
300
301         return (SYNTAX(XCHAR_TABLE(ct), c) == Sword);
302 }
303 #else
304 # define ISWORD(c)      (ISALPHA(c))
305 #endif  /* emacs */
306
307 #ifndef NULL
308 #define NULL (void *)0
309 #endif
310
311 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
312    since ours (we hope) works properly with all combinations of
313    machines, compilers, `char' and `unsigned char' argument types.
314    (Per Bothner suggested the basic approach.)  */
315 #undef SIGN_EXTEND_CHAR
316 #if __STDC__
317 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
318 #else                           /* not __STDC__ */
319 /* As in Harbison and Steele.  */
320 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
321 #endif
322 \f
323 /* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
324    use `alloca' instead of `malloc'.  This is because using malloc in
325    re_search* or re_match* could cause memory leaks when C-g is used in
326    Emacs; also, malloc is slower and causes storage fragmentation.  On
327    the other hand, malloc is more portable, and easier to debug.
328
329    Because we sometimes use alloca, some routines have to be macros,
330    not functions -- `alloca'-allocated space disappears at the end of the
331    function it is called in.  */
332
333 #ifdef REGEX_MALLOC
334
335 #define REGEX_ALLOCATE  xmalloc
336 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc(source, nsize)
337 #define REGEX_FREE      xfree
338
339 #else  /* !REGEX_MALLOC */
340
341 /* Emacs already defines alloca, sometimes.  */
342 #ifndef alloca
343
344 /* Make alloca work the best possible way.  */
345 #ifdef __GNUC__
346 #define alloca __builtin_alloca
347 #else                           /* not __GNUC__ */
348 #if HAVE_ALLOCA_H
349 #include <alloca.h>
350 #else                           /* not __GNUC__ or HAVE_ALLOCA_H */
351 #ifndef _AIX                    /* Already did AIX, up at the top.  */
352 void *alloca();
353 #endif                          /* not _AIX */
354 #endif                          /* HAVE_ALLOCA_H */
355 #endif                          /* __GNUC__ */
356
357 #endif                          /* not alloca */
358
359 #define REGEX_ALLOCATE alloca
360
361 /* Assumes a `char *destination' variable.  */
362 #define REGEX_REALLOCATE(source, osize, nsize)                          \
363   (destination = (char *) alloca (nsize),                               \
364    memmove (destination, source, osize),                                \
365    destination)
366
367 /* No need to do anything to free, after alloca.  */
368 #define REGEX_FREE(arg) ((void)0)       /* Do nothing!  But inhibit gcc warning.  */
369
370 #endif  /* REGEX_MALLOC */
371
372 /* Define how to allocate the failure stack.  */
373
374 #ifdef REL_ALLOC
375 #error not again
376 #define REGEX_ALLOCATE_STACK(size)                              \
377         r_alloc ((char **) &failure_stack_ptr, (size))
378 #define REGEX_REALLOCATE_STACK(source, osize, nsize)            \
379         r_re_alloc ((char **) &failure_stack_ptr, (nsize))
380 #define REGEX_FREE_STACK(ptr)                                   \
381         r_alloc_free ((void **) &failure_stack_ptr)
382
383 #else  /* !REL_ALLOC */
384
385 #ifdef REGEX_MALLOC
386
387 #define REGEX_ALLOCATE_STACK    xmalloc
388 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc(source, nsize)
389 #define REGEX_FREE_STACK        xfree
390
391 #else  /* !REGEX_MALLOC */
392
393 #define REGEX_ALLOCATE_STACK alloca
394
395 #define REGEX_REALLOCATE_STACK(source, osize, nsize)                    \
396    REGEX_REALLOCATE (source, osize, nsize)
397 /* No need to explicitly free anything.  */
398 #define REGEX_FREE_STACK(arg)
399
400 #endif  /* REGEX_MALLOC */
401 #endif  /* REL_ALLOC */
402
403 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
404    `string1' or just past its end.  This works if PTR is NULL, which is
405    a good thing.  */
406 #define FIRST_STRING_P(ptr)                                     \
407   (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
408
409 /* (Re)Allocate N items of type T using malloc, or fail.  */
410 #define TALLOC(n, t)                            \
411         ((t *)xmalloc((n) * sizeof (t)))
412 #define TALLOC_ATOMIC(n, t)                     \
413         ((t *)xmalloc_atomic((n) * sizeof (t)))
414 #define RETALLOC(addr, n, t)                                    \
415         ((addr) = (t *)xrealloc(addr, (n) * sizeof (t)))
416 #define RETALLOC_IF(addr, n, t)                                 \
417         if (addr) {                                             \
418                 RETALLOC((addr), (n), t);                       \
419         } else                                                  \
420                 (addr) = TALLOC ((n), t)
421 #define REGEX_TALLOC(n, t)                              \
422         ((t *)REGEX_ALLOCATE((n) * sizeof (t)))
423
424 #define BYTEWIDTH 8             /* In bits.  */
425
426 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
427
428 #undef MAX
429 #undef MIN
430 #define MAX(a, b) ((a) > (b) ? (a) : (b))
431 #define MIN(a, b) ((a) < (b) ? (a) : (b))
432
433 /* Type of source-pattern and string chars.  */
434 typedef const unsigned char re_char;
435
436 typedef char re_bool;
437 #define false 0
438 #define true 1
439 \f
440 /* These are the command codes that appear in compiled regular
441    expressions.  Some opcodes are followed by argument bytes.  A
442    command code can specify any interpretation whatsoever for its
443    arguments.  Zero bytes may appear in the compiled regular expression.  */
444
445 typedef enum {
446         no_op = 0,
447
448         /* Succeed right away--no more backtracking.  */
449         succeed,
450
451         /* Followed by one byte giving n, then by n literal bytes.  */
452         exactn,
453
454         /* Matches any (more or less) character.  */
455         anychar,
456
457         /* Matches any one char belonging to specified set.  First
458            following byte is number of bitmap bytes.  Then come bytes
459            for a bitmap saying which chars are in.  Bits in each byte
460            are ordered low-bit-first.  A character is in the set if its
461            bit is 1.  A character too large to have a bit in the map is
462            automatically not in the set.  */
463         charset,
464
465         /* Same parameters as charset, but match any character that is
466            not one of those specified.  */
467         charset_not,
468
469         /* Start remembering the text that is matched, for storing in a
470            register.  Followed by one byte with the register number, in
471            the range 1 to the pattern buffer's re_ngroups
472            field.  Then followed by one byte with the number of groups
473            inner to this one.  (This last has to be part of the
474            start_memory only because we need it in the on_failure_jump
475            of re_match_2.)  */
476         start_memory,
477
478         /* Stop remembering the text that is matched and store it in a
479            memory register.  Followed by one byte with the register
480            number, in the range 1 to `re_ngroups' in the
481            pattern buffer, and one byte with the number of inner groups,
482            just like `start_memory'.  (We need the number of inner
483            groups here because we don't have any easy way of finding the
484            corresponding start_memory when we're at a stop_memory.)  */
485         stop_memory,
486
487         /* Match a duplicate of something remembered. Followed by one
488            byte containing the register number.  */
489         duplicate,
490
491         /* Fail unless at beginning of line.  */
492         begline,
493
494         /* Fail unless at end of line.  */
495         endline,
496
497         /* Succeeds if at beginning of buffer (if emacs) or at beginning
498            of string to be matched (if not).  */
499         begbuf,
500
501         /* Analogously, for end of buffer/string.  */
502         endbuf,
503
504         /* Followed by two byte relative address to which to jump.  */
505         jump,
506
507         /* Same as jump, but marks the end of an alternative.  */
508         jump_past_alt,
509
510         /* Followed by two-byte relative address of place to resume at
511            in case of failure.  */
512         on_failure_jump,
513
514         /* Like on_failure_jump, but pushes a placeholder instead of the
515            current string position when executed.  */
516         on_failure_keep_string_jump,
517
518         /* Throw away latest failure point and then jump to following
519            two-byte relative address.  */
520         pop_failure_jump,
521
522         /* Change to pop_failure_jump if know won't have to backtrack to
523            match; otherwise change to jump.  This is used to jump
524            back to the beginning of a repeat.  If what follows this jump
525            clearly won't match what the repeat does, such that we can be
526            sure that there is no use backtracking out of repetitions
527            already matched, then we change it to a pop_failure_jump.
528            Followed by two-byte address.  */
529         maybe_pop_jump,
530
531         /* Jump to following two-byte address, and push a dummy failure
532            point. This failure point will be thrown away if an attempt
533            is made to use it for a failure.  A `+' construct makes this
534            before the first repeat.  Also used as an intermediary kind
535            of jump when compiling an alternative.  */
536         dummy_failure_jump,
537
538         /* Push a dummy failure point and continue.  Used at the end of
539            alternatives.  */
540         push_dummy_failure,
541
542         /* Followed by two-byte relative address and two-byte number n.
543            After matching N times, jump to the address upon failure.  */
544         succeed_n,
545
546         /* Followed by two-byte relative address, and two-byte number n.
547            Jump to the address N times, then fail.  */
548         jump_n,
549
550         /* Set the following two-byte relative address to the
551            subsequent two-byte number.  The address *includes* the two
552            bytes of number.  */
553         set_number_at,
554
555         wordchar,               /* Matches any word-constituent character.  */
556         notwordchar,            /* Matches any char that is not a word-constituent.  */
557
558         wordbeg,                /* Succeeds if at word beginning.  */
559         wordend,                /* Succeeds if at word end.  */
560
561         wordbound,              /* Succeeds if at a word boundary.  */
562         notwordbound            /* Succeeds if not at a word boundary.  */
563 #ifdef emacs
564             , before_dot,       /* Succeeds if before point.  */
565         at_dot,                 /* Succeeds if at point.  */
566         after_dot,              /* Succeeds if after point.  */
567
568         /* Matches any character whose syntax is specified.  Followed by
569            a byte which contains a syntax code, e.g., Sword.  */
570         syntaxspec,
571
572         /* Matches any character whose syntax is not that specified.  */
573         notsyntaxspec
574 #endif                          /* emacs */
575 #ifdef MULE
576             /* need extra stuff to be able to properly work with XEmacs/Mule
577                characters (which may take up more than one byte) */
578             , charset_mule,     /* Matches any character belonging to specified set.
579                                    The set is stored in "unified range-table
580                                    format"; see rangetab.c.  Unlike the `charset'
581                                    opcode, this can handle arbitrary characters. */
582
583         charset_mule_not        /* Same parameters as charset_mule, but match any
584                                    character that is not one of those specified.  */
585             /* 97/2/17 jhod: The following two were merged back in from the Mule
586                2.3 code to enable some language specific processing */
587             , categoryspec,     /* Matches entries in the character category tables */
588         notcategoryspec         /* The opposite of the above */
589 #endif                          /* MULE */
590 } re_opcode_t;
591 \f
592 /* Common operations on the compiled pattern.  */
593
594 /* Store NUMBER in two contiguous bytes starting at DESTINATION.  */
595
596 #define STORE_NUMBER(destination, number)                               \
597   do {                                                                  \
598     (destination)[0] = (number) & 0377;                                 \
599     (destination)[1] = (number) >> 8;                                   \
600   } while (0)
601
602 /* Same as STORE_NUMBER, except increment DESTINATION to
603    the byte after where the number is stored.  Therefore, DESTINATION
604    must be an lvalue.  */
605
606 #define STORE_NUMBER_AND_INCR(destination, number)                      \
607   do {                                                                  \
608     STORE_NUMBER (destination, number);                                 \
609     (destination) += 2;                                                 \
610   } while (0)
611
612 /* Put into DESTINATION a number stored in two contiguous bytes starting
613    at SOURCE.  */
614
615 #define EXTRACT_NUMBER(destination, source)                             \
616   do {                                                                  \
617     (destination) = *(source) & 0377;                                   \
618     (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8;           \
619   } while (0)
620
621 #ifdef REGEX_DEBUG_FLAG
622 static void extract_number(int *dest, re_char * source)
623 {
624         int temp = SIGN_EXTEND_CHAR(*(source + 1));
625         *dest = *source & 0377;
626         *dest += temp << 8;
627 }
628
629 #ifndef EXTRACT_MACROS          /* To debug the macros.  */
630 #undef EXTRACT_NUMBER
631 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
632 #endif                          /* not EXTRACT_MACROS */
633
634 #endif                          /* REGEX_DEBUG_FLAG */
635
636 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
637    SOURCE must be an lvalue.  */
638
639 #define EXTRACT_NUMBER_AND_INCR(destination, source)                    \
640         do {                                                            \
641                 EXTRACT_NUMBER (destination, source);                   \
642                 (source) += 2;                                          \
643         } while (0)
644
645 #ifdef REGEX_DEBUG_FLAG
646 static void
647 extract_number_and_incr(int *destination, unsigned char **source)
648 {
649         extract_number(destination, *source);
650         *source += 2;
651 }
652
653 #ifndef EXTRACT_MACROS
654 #undef EXTRACT_NUMBER_AND_INCR
655 #define EXTRACT_NUMBER_AND_INCR(dest, src)      \
656         extract_number_and_incr (&dest, &src)
657 #endif                          /* not EXTRACT_MACROS */
658
659 #endif                          /* REGEX_DEBUG_FLAG */
660 \f
661 /* If REGEX_DEBUG_FLAG is defined, Regex prints many voluminous messages about
662    what it is doing (if the variable `debug' is nonzero).  If linked with the
663    main program in `iregex.c', you can enter patterns and strings interactively.
664    And if linked with the main program in `main.c' and the other test files, you
665    can run the already-written tests.  */
666
667 #if defined (REGEX_DEBUG_FLAG)
668
669 /* We use standard I/O for debugging.  */
670 #include <stdio.h>
671
672 #ifndef emacs
673 /* XEmacs provides its own version of assert() */
674 /* It is useful to test things that ``must'' be true when debugging.  */
675 #include <assert.h>
676 #endif
677
678 #if 0
679 /* no point in having this one, since we do not offer any accessor to this */
680 static int debug = 0;
681 #endif
682
683 #define DEBUG_STATEMENT(e)              e
684 #define DEBUG_PRINT1(x)                 printf(x)
685 #define DEBUG_PRINT2(x1, x2)            printf(x1, x2)
686 #define DEBUG_PRINT3(x1, x2, x3)        printf(x1, x2, x3)
687 #define DEBUG_PRINT4(x1, x2, x3, x4)    printf(x1, x2, x3, x4)
688 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)                           \
689         print_partial_compiled_pattern(s, e)
690 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)                  \
691         print_double_string (w, s1, sz1, s2, sz2)
692
693 /* Print the fastmap in human-readable form.  */
694
695 static void print_fastmap(char *fastmap)
696 {
697         unsigned was_a_range = 0;
698         unsigned i = 0;
699
700         while (i < (1 << BYTEWIDTH)) {
701                 if (fastmap[i++]) {
702                         was_a_range = 0;
703                         putchar(i - 1);
704                         while (i < (1 << BYTEWIDTH) && fastmap[i]) {
705                                 was_a_range = 1;
706                                 i++;
707                         }
708                         if (was_a_range) {
709                                 putchar('-');
710                                 putchar(i - 1);
711                         }
712                 }
713         }
714         putchar('\n');
715 }
716
717 /* Print a compiled pattern string in human-readable form, starting at
718    the START pointer into it and ending just before the pointer END.  */
719
720 /* can't help it ... the ugly mule code changes `start' upon alignment,
721  * so this can't have the const qualifier ... bugger */
722 static void
723 print_partial_compiled_pattern(unsigned char *start, re_char *end)
724 {
725         int mcnt, mcnt2;
726         unsigned char *p = start;
727         re_char *pend = end;
728
729         if (start == NULL) {
730                 puts("(null)");
731                 return;
732         }
733
734         /* Loop over pattern commands.  */
735         while (p < pend) {
736                 printf("%ld:\t", (long)(p - start));
737
738                 switch ((const re_opcode_t)*p++) {
739                 case no_op:
740                         printf("/no_op");
741                         break;
742
743                 case exactn:
744                         mcnt = *p++;
745                         printf("/exactn/%d", mcnt);
746                         do {
747                                 putchar('/');
748                                 putchar(*p++);
749                         }
750                         while (--mcnt);
751                         break;
752
753                 case start_memory:
754                         mcnt = *p++;
755                         printf("/start_memory/%d/%d", mcnt, *p++);
756                         break;
757
758                 case stop_memory:
759                         mcnt = *p++;
760                         printf("/stop_memory/%d/%d", mcnt, *p++);
761                         break;
762
763                 case duplicate:
764                         printf("/duplicate/%d", *p++);
765                         break;
766
767                 case anychar:
768                         printf("/anychar");
769                         break;
770
771                 case charset:
772                 case charset_not: {
773                         REGISTER int c, last = -100;
774                         REGISTER int in_range = 0;
775
776                         printf("/charset [%s",
777                                (re_opcode_t) * (p - 1) == charset_not
778                                ? "^" : "");
779
780                         assert(p + *p < pend);
781
782                         for (c = 0; c < 256; c++)
783                                 if (((unsigned char)(c / 8) < *p)
784                                     && (p[1 + (c / 8)] &
785                                         (1 << (c % 8)))) {
786                                         /* Are we starting a range?  */
787                                         if (last + 1 == c && !in_range) {
788                                                 putchar('-');
789                                                 in_range = 1;
790                                         }
791                                         /* Have we broken a range?  */
792                                         else if (last + 1 != c
793                                                  && in_range) {
794                                                 putchar(last);
795                                                 in_range = 0;
796                                         }
797
798                                         if (!in_range)
799                                                 putchar(c);
800
801                                         last = c;
802                                 }
803
804                         if (in_range)
805                                 putchar(last);
806
807                         putchar(']');
808
809                         p += 1 + *p;
810                         break;
811                 }
812
813 #ifdef MULE
814                 case charset_mule:
815                 case charset_mule_not: {
816                         int nentries, i;
817
818                         printf("/charset_mule [%s",
819                                (re_opcode_t) * (p - 1) == charset_mule_not
820                                ? "^" : "");
821                         /* u_r_t_nentries() CHANGES its arg, why on earth
822                          * is this marked const here then? :O -hrop */
823                         nentries = unified_range_table_nentries(p);
824                         for (i = 0; i < nentries; i++) {
825                                 EMACS_INT first, last;
826                                 Lisp_Object dummy_val;
827
828                                 /* u_r_t_get_range CHANGES its arg ...
829                                  * p can't be const thence */
830                                 unified_range_table_get_range(p, i,
831                                                               &first,
832                                                               &last,
833                                                               &dummy_val);
834                                 if (first < 0x100)
835                                         putchar(first);
836                                 else
837                                         printf("(0x%lx)", (long)first);
838                                 if (first != last) {
839                                         putchar('-');
840                                         if (last < 0x100)
841                                                 putchar(last);
842                                         else
843                                                 printf("(0x%lx)",
844                                                        (long)last);
845                                 }
846                         }
847                         putchar(']');
848                         p += unified_range_table_bytes_used(p);
849                 }
850                         break;
851 #endif
852
853                 case begline:
854                         printf("/begline");
855                         break;
856
857                 case endline:
858                         printf("/endline");
859                         break;
860
861                 case on_failure_jump:
862                         extract_number_and_incr(&mcnt, &p);
863                         printf("/on_failure_jump to %ld",
864                                (long)(p + mcnt - start));
865                         break;
866
867                 case on_failure_keep_string_jump:
868                         extract_number_and_incr(&mcnt, &p);
869                         printf("/on_failure_keep_string_jump to %ld",
870                                (long)(p + mcnt - start));
871                         break;
872
873                 case dummy_failure_jump:
874                         extract_number_and_incr(&mcnt, &p);
875                         printf("/dummy_failure_jump to %ld",
876                                (long)(p + mcnt - start));
877                         break;
878
879                 case push_dummy_failure:
880                         printf("/push_dummy_failure");
881                         break;
882
883                 case maybe_pop_jump:
884                         extract_number_and_incr(&mcnt, &p);
885                         printf("/maybe_pop_jump to %ld",
886                                (long)(p + mcnt - start));
887                         break;
888
889                 case pop_failure_jump:
890                         extract_number_and_incr(&mcnt, &p);
891                         printf("/pop_failure_jump to %ld",
892                                (long)(p + mcnt - start));
893                         break;
894
895                 case jump_past_alt:
896                         extract_number_and_incr(&mcnt, &p);
897                         printf("/jump_past_alt to %ld",
898                                (long)(p + mcnt - start));
899                         break;
900
901                 case jump:
902                         extract_number_and_incr(&mcnt, &p);
903                         printf("/jump to %ld", (long)(p + mcnt - start));
904                         break;
905
906                 case succeed_n:
907                         extract_number_and_incr(&mcnt, &p);
908                         extract_number_and_incr(&mcnt2, &p);
909                         printf("/succeed_n to %ld, %d times",
910                                (long)(p + mcnt - start), mcnt2);
911                         break;
912
913                 case jump_n:
914                         extract_number_and_incr(&mcnt, &p);
915                         extract_number_and_incr(&mcnt2, &p);
916                         printf("/jump_n to %ld, %d times",
917                                (long)(p + mcnt - start), mcnt2);
918                         break;
919
920                 case set_number_at:
921                         extract_number_and_incr(&mcnt, &p);
922                         extract_number_and_incr(&mcnt2, &p);
923                         printf("/set_number_at location %ld to %d",
924                                (long)(p + mcnt - start), mcnt2);
925                         break;
926
927                 case wordbound:
928                         printf("/wordbound");
929                         break;
930
931                 case notwordbound:
932                         printf("/notwordbound");
933                         break;
934
935                 case wordbeg:
936                         printf("/wordbeg");
937                         break;
938
939                 case wordend:
940                         printf("/wordend");
941
942 #ifdef emacs
943                 case before_dot:
944                         printf("/before_dot");
945                         break;
946
947                 case at_dot:
948                         printf("/at_dot");
949                         break;
950
951                 case after_dot:
952                         printf("/after_dot");
953                         break;
954
955                 case syntaxspec:
956                         printf("/syntaxspec");
957                         mcnt = *p++;
958                         printf("/%d", mcnt);
959                         break;
960
961                 case notsyntaxspec:
962                         printf("/notsyntaxspec");
963                         mcnt = *p++;
964                         printf("/%d", mcnt);
965                         break;
966
967 #ifdef MULE
968 /* 97/2/17 jhod Mule category patch */
969                 case categoryspec:
970                         printf("/categoryspec");
971                         mcnt = *p++;
972                         printf("/%d", mcnt);
973                         break;
974
975                 case notcategoryspec:
976                         printf("/notcategoryspec");
977                         mcnt = *p++;
978                         printf("/%d", mcnt);
979                         break;
980 /* end of category patch */
981 #endif                          /* MULE */
982 #endif                          /* emacs */
983
984                 case wordchar:
985                         printf("/wordchar");
986                         break;
987
988                 case notwordchar:
989                         printf("/notwordchar");
990                         break;
991
992                 case begbuf:
993                         printf("/begbuf");
994                         break;
995
996                 case endbuf:
997                         printf("/endbuf");
998                         break;
999
1000                 case succeed:
1001                 default:
1002                         printf("?%d", *(p - 1));
1003                 }
1004
1005                 putchar('\n');
1006         }
1007
1008         printf("%ld:\tend of pattern.\n", (long)(p - start));
1009 }
1010
1011 static void
1012 print_compiled_pattern(struct re_pattern_buffer *bufp)
1013 {
1014 #if 0
1015         /* cant be const */
1016         re_char *buffer = bufp->buffer;
1017 #else
1018         unsigned char *buffer = bufp->buffer;
1019 #endif
1020
1021         print_partial_compiled_pattern(buffer, buffer + bufp->used);
1022         printf("%ld bytes used/%ld bytes allocated.\n", bufp->used,
1023                bufp->allocated);
1024
1025         if (bufp->fastmap_accurate && bufp->fastmap) {
1026                 printf("fastmap: ");
1027                 print_fastmap(bufp->fastmap);
1028         }
1029
1030         printf("re_nsub: %ld\t", (long)bufp->re_nsub);
1031         printf("re_ngroups: %ld\t", (long)bufp->re_ngroups);
1032         printf("regs_alloc: %d\t", bufp->regs_allocated);
1033         printf("can_be_null: %d\t", bufp->can_be_null);
1034         printf("newline_anchor: %d\n", bufp->newline_anchor);
1035         printf("no_sub: %d\t", bufp->no_sub);
1036         printf("not_bol: %d\t", bufp->not_bol);
1037         printf("not_eol: %d\t", bufp->not_eol);
1038         printf("syntax: %d\n", bufp->syntax);
1039         /* Perhaps we should print the translate table?  */
1040         /* and maybe the category table? */
1041
1042         if (bufp->external_to_internal_register) {
1043                 int i;
1044
1045                 printf("external_to_internal_register:\n");
1046                 for (i = 0; i <= bufp->re_nsub; i++) {
1047                         if (i > 0)
1048                                 printf(", ");
1049                         printf("%d -> %d", i,
1050                                bufp->external_to_internal_register[i]);
1051                 }
1052                 printf("\n");
1053         }
1054 }
1055
1056 static void
1057 print_double_string(re_char * where, re_char * string1, int size1,
1058                     re_char * string2, int size2)
1059 {
1060         if (where == NULL)
1061                 printf("(null)");
1062         else {
1063                 Element_count this_char;
1064
1065                 if (FIRST_STRING_P(where)) {
1066                         for (this_char = where - string1; this_char < size1;
1067                              this_char++)
1068                                 putchar(string1[this_char]);
1069
1070                         where = string2;
1071                 }
1072
1073                 for (this_char = where - string2; this_char < size2;
1074                      this_char++)
1075                         putchar(string2[this_char]);
1076         }
1077 }
1078
1079 #else                           /* not REGEX_DEBUG_FLAG */
1080
1081 #undef assert
1082 #define assert(e)
1083
1084 #define DEBUG_STATEMENT(e)
1085 #define DEBUG_PRINT1(x)
1086 #define DEBUG_PRINT2(x1, x2)
1087 #define DEBUG_PRINT3(x1, x2, x3)
1088 #define DEBUG_PRINT4(x1, x2, x3, x4)
1089 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1090 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1091
1092 #endif                          /* REGEX_DEBUG_FLAG */
1093 \f
1094 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
1095    also be assigned to arbitrarily: each pattern buffer stores its own
1096    syntax, so it can be changed between regex compilations.  */
1097 /* This has no initializer because initialized variables in Emacs
1098    become read-only after dumping.  */
1099 reg_syntax_t re_syntax_options;
1100
1101 /* Specify the precise syntax of regexps for compilation.  This provides
1102    for compatibility for various utilities which historically have
1103    different, incompatible syntaxes.
1104
1105    The argument SYNTAX is a bit mask comprised of the various bits
1106    defined in regex.h.  We return the old syntax.  */
1107
1108 reg_syntax_t re_set_syntax(reg_syntax_t syntax)
1109 {
1110         reg_syntax_t ret = re_syntax_options;
1111
1112         re_syntax_options = syntax;
1113         return ret;
1114 }
1115 \f
1116 /* This table gives an error message for each of the error codes listed
1117    in regex.h.  Obviously the order here has to be same as there.
1118    POSIX doesn't require that we do anything for REG_NOERROR,
1119    but why not be nice?  */
1120
1121 static const char *re_error_msgid[] = {
1122         "Success",              /* REG_NOERROR */
1123         "No match",             /* REG_NOMATCH */
1124         "Invalid regular expression",   /* REG_BADPAT */
1125         "Invalid collation character",  /* REG_ECOLLATE */
1126         "Invalid character class name", /* REG_ECTYPE */
1127         "Trailing backslash",   /* REG_EESCAPE */
1128         "Invalid back reference",       /* REG_ESUBREG */
1129         "Unmatched [ or [^",    /* REG_EBRACK */
1130         "Unmatched ( or \\(",   /* REG_EPAREN */
1131         "Unmatched \\{",        /* REG_EBRACE */
1132         "Invalid content of \\{\\}",    /* REG_BADBR */
1133         "Invalid range end",    /* REG_ERANGE */
1134         "Memory exhausted",     /* REG_ESPACE */
1135         "Invalid preceding regular expression", /* REG_BADRPT */
1136         "Premature end of regular expression",  /* REG_EEND */
1137         "Regular expression too big",   /* REG_ESIZE */
1138         "Unmatched ) or \\)",   /* REG_ERPAREN */
1139 #ifdef emacs
1140         "Invalid syntax designator",    /* REG_ESYNTAX */
1141 #endif
1142 #ifdef MULE
1143         "Ranges may not span charsets", /* REG_ERANGESPAN */
1144         "Invalid category designator",  /* REG_ECATEGORY */
1145 #endif
1146 };
1147 \f
1148 /* Avoiding alloca during matching, to placate r_alloc.  */
1149
1150 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1151    searching and matching functions should not call alloca.  On some
1152    systems, alloca is implemented in terms of malloc, and if we're
1153    using the relocating allocator routines, then malloc could cause a
1154    relocation, which might (if the strings being searched are in the
1155    ralloc heap) shift the data out from underneath the regexp
1156    routines.
1157
1158    Here's another reason to avoid allocation: Emacs
1159    processes input from X in a signal handler; processing X input may
1160    call malloc; if input arrives while a matching routine is calling
1161    malloc, then we're scrod.  But Emacs can't just block input while
1162    calling matching routines; then we don't notice interrupts when
1163    they come in.  So, Emacs blocks input around all regexp calls
1164    except the matching calls, which it leaves unprotected, in the
1165    faith that they will not malloc.  */
1166
1167 /* Normally, this is fine.  */
1168 #define MATCH_MAY_ALLOCATE
1169
1170 /* When using GNU C, we are not REALLY using the C alloca, no matter
1171    what config.h may say.  So don't take precautions for it.  */
1172 #ifdef __GNUC__
1173 #undef C_ALLOCA
1174 #endif
1175
1176 /* The match routines may not allocate if (1) they would do it with malloc
1177    and (2) it's not safe for them to use malloc.
1178    Note that if REL_ALLOC is defined, matching would not use malloc for the
1179    failure stack, but we would still use it for the register vectors;
1180    so REL_ALLOC should not affect this.  */
1181 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1182 #undef MATCH_MAY_ALLOCATE
1183 #endif
1184 \f
1185 /* Failure stack declarations and macros; both re_compile_fastmap and
1186    re_match_2 use a failure stack.  These have to be macros because of
1187    REGEX_ALLOCATE_STACK.  */
1188
1189 /* Number of failure points for which to initially allocate space
1190    when matching.  If this number is exceeded, we allocate more
1191    space, so it is not a hard limit.  */
1192 #ifndef INIT_FAILURE_ALLOC
1193 #define INIT_FAILURE_ALLOC 20
1194 #endif
1195
1196 /* Roughly the maximum number of failure points on the stack.  Would be
1197    exactly that if always used MAX_FAILURE_SPACE each time we failed.
1198    This is a variable only so users of regex can assign to it; we never
1199    change it ourselves.  */
1200 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1201 /* 4400 was enough to cause a crash on Alpha OSF/1,
1202    whose default stack limit is 2mb.  */
1203 int re_max_failures = 50000;
1204 #else
1205 int re_max_failures = 2000;
1206 #endif
1207
1208 union fail_stack_elt {
1209         const unsigned char *pointer;
1210         int integer;
1211 };
1212
1213 typedef union fail_stack_elt fail_stack_elt_t;
1214
1215 typedef struct {
1216         fail_stack_elt_t *stack;
1217         Element_count size;
1218         Element_count avail;    /* Offset of next open position.  */
1219 } fail_stack_type;
1220
1221 #define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)
1222 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1223 #define FAIL_STACK_FULL()      (fail_stack.avail == fail_stack.size)
1224
1225 /* Define macros to initialize and free the failure stack.
1226    Do `return -2' if the alloc fails.  */
1227
1228 #ifdef MATCH_MAY_ALLOCATE
1229 #define INIT_FAIL_STACK()                                               \
1230         do {                                                            \
1231                 fail_stack.stack = (fail_stack_elt_t *)                 \
1232                         REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC *      \
1233                                               sizeof (fail_stack_elt_t)); \
1234                                                                         \
1235                 if (fail_stack.stack == NULL)                           \
1236                         return -2;                                      \
1237                                                                         \
1238                 fail_stack.size = INIT_FAILURE_ALLOC;                   \
1239                 fail_stack.avail = 0;                                   \
1240         } while (0)
1241
1242 #define RESET_FAIL_STACK()  REGEX_FREE_STACK (fail_stack.stack)
1243 #else
1244 #define INIT_FAIL_STACK()                       \
1245         do {                                    \
1246                 fail_stack.avail = 0;           \
1247         } while (0)
1248
1249 #define RESET_FAIL_STACK()
1250 #endif
1251
1252 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1253
1254    Return 1 if succeeds, and 0 if either ran out of memory
1255    allocating space for it or it was already too large.
1256
1257    REGEX_REALLOCATE_STACK requires `destination' be declared.   */
1258
1259 #define DOUBLE_FAIL_STACK(fail_stack)                                   \
1260   ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS        \
1261    ? 0                                                                  \
1262    : ((fail_stack).stack = (fail_stack_elt_t *)                         \
1263         REGEX_REALLOCATE_STACK ((fail_stack).stack,                     \
1264           (fail_stack).size * sizeof (fail_stack_elt_t),                \
1265           ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)),        \
1266                                                                         \
1267       (fail_stack).stack == NULL                                        \
1268       ? 0                                                               \
1269       : ((fail_stack).size <<= 1,                                       \
1270          1)))
1271
1272 /* Push pointer POINTER on FAIL_STACK.
1273    Return 1 if was able to do so and 0 if ran out of memory allocating
1274    space to do so.  */
1275 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK)                            \
1276   ((FAIL_STACK_FULL ()                                                  \
1277     && !DOUBLE_FAIL_STACK (FAIL_STACK))                                 \
1278    ? 0                                                                  \
1279    : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER,       \
1280       1))
1281
1282 /* Push a pointer value onto the failure stack.
1283    Assumes the variable `fail_stack'.  Probably should only
1284    be called from within `PUSH_FAILURE_POINT'.  */
1285 #define PUSH_FAILURE_POINTER(item)                                      \
1286         fail_stack.stack[fail_stack.avail++].pointer =                  \
1287                 (const unsigned char*)(item)
1288
1289 /* This pushes an integer-valued item onto the failure stack.
1290    Assumes the variable `fail_stack'.  Probably should only
1291    be called from within `PUSH_FAILURE_POINT'.  */
1292 #define PUSH_FAILURE_INT(item)                                  \
1293         fail_stack.stack[fail_stack.avail++].integer = (item)
1294
1295 /* Push a fail_stack_elt_t value onto the failure stack.
1296    Assumes the variable `fail_stack'.  Probably should only
1297    be called from within `PUSH_FAILURE_POINT'.  */
1298 #define PUSH_FAILURE_ELT(item)                                  \
1299         fail_stack.stack[fail_stack.avail++] =  (item)
1300
1301 /* These three POP... operations complement the three PUSH... operations.
1302    All assume that `fail_stack' is nonempty.  */
1303 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1304 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1305 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1306
1307 /* Used to omit pushing failure point id's when we're not debugging.  */
1308 #ifdef REGEX_DEBUG_FLAG
1309 #define DEBUG_PUSH PUSH_FAILURE_INT
1310 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1311 #else
1312 #define DEBUG_PUSH(item)
1313 #define DEBUG_POP(item_addr)
1314 #endif
1315
1316 /* Push the information about the state we will need
1317    if we ever fail back to it.
1318
1319    Requires variables fail_stack, regstart, regend, reg_info, and
1320    num_regs be declared.  DOUBLE_FAIL_STACK requires `destination' be
1321    declared.
1322
1323    Does `return FAILURE_CODE' if runs out of memory.  */
1324
1325 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1326 #define DECLARE_DESTINATION char *destination
1327 #else
1328 #define DECLARE_DESTINATION DECLARE_NOTHING
1329 #endif
1330
1331 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)   \
1332         do {                                                            \
1333                 DECLARE_DESTINATION;                                    \
1334                 /* Must be int, so when we don't save any registers, the\
1335                    arithmetic \ of 0 + -1 isn't done as unsigned.  */   \
1336                 int this_reg;                                           \
1337                                                                         \
1338                 DEBUG_STATEMENT (failure_id++);                         \
1339                 DEBUG_STATEMENT (nfailure_points_pushed++);             \
1340                 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1341                 DEBUG_PRINT2 ("  Before push, next avail: %lu\n",       \
1342                               (long unsigned int)(fail_stack).avail);   \
1343                 DEBUG_PRINT2 ("                     size: %lu\n",       \
1344                               (long unsigned int)(fail_stack).size);    \
1345                                                                         \
1346                 DEBUG_PRINT2 ("  slots needed: %d\n", NUM_FAILURE_ITEMS); \
1347                 DEBUG_PRINT2 ("     available: %ld\n",                  \
1348                               (long) REMAINING_AVAIL_SLOTS);            \
1349                                                                         \
1350                 /* Ensure we have enough space allocated for what       \
1351                  * we will push.  */                                    \
1352                 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) {     \
1353                         if (!DOUBLE_FAIL_STACK (fail_stack))            \
1354                                 return failure_code;                    \
1355                                                                         \
1356                         DEBUG_PRINT2 ("\n  Doubled stack; size now: %lu\n", \
1357                                       (unsigned long) (fail_stack).size); \
1358                         DEBUG_PRINT2 ("  slots available: %ld\n",       \
1359                                       (long) REMAINING_AVAIL_SLOTS);    \
1360                 }                                                       \
1361                                                                         \
1362                 /* Push the info, starting with the registers.  */      \
1363                 DEBUG_PRINT1 ("\n");                                    \
1364                                                                         \
1365                 for (this_reg = lowest_active_reg;                      \
1366                      this_reg <= highest_active_reg;                    \
1367                      this_reg++) {                                      \
1368                         DEBUG_PRINT2 ("  Pushing reg: %d\n", this_reg); \
1369                         DEBUG_STATEMENT (num_regs_pushed++);            \
1370                                                                         \
1371                         DEBUG_PRINT2("    start: %p\n",                 \
1372                                      regstart[this_reg]);               \
1373                         PUSH_FAILURE_POINTER(regstart[this_reg]);       \
1374                                                                         \
1375                         DEBUG_PRINT2 ("    end: %p\n",                  \
1376                                       regend[this_reg]);                \
1377                         PUSH_FAILURE_POINTER(regend[this_reg]); \
1378                                                                         \
1379                         DEBUG_PRINT2 ("    info: 0x%p\n      ",         \
1380                                       &reg_info[this_reg]);             \
1381                         DEBUG_PRINT2 (" match_null=%d",                 \
1382                                       REG_MATCH_NULL_STRING_P           \
1383                                       (reg_info[this_reg]));            \
1384                         DEBUG_PRINT2 (" active=%d",                     \
1385                                       IS_ACTIVE (reg_info[this_reg]));  \
1386                         DEBUG_PRINT2 (" matched_something=%d",          \
1387                                       MATCHED_SOMETHING                 \
1388                                       (reg_info[this_reg]));            \
1389                         DEBUG_PRINT2 (" ever_matched_something=%d",     \
1390                                       EVER_MATCHED_SOMETHING            \
1391                                       (reg_info[this_reg]));            \
1392                         DEBUG_PRINT1 ("\n");                            \
1393                         PUSH_FAILURE_ELT(reg_info[this_reg].word);      \
1394                 }                                                       \
1395                                                                         \
1396                 DEBUG_PRINT2 ("  Pushing  low active reg: %d\n",        \
1397                               lowest_active_reg);                       \
1398                 PUSH_FAILURE_INT (lowest_active_reg);                   \
1399                                                                         \
1400                 DEBUG_PRINT2 ("  Pushing high active reg: %d\n",        \
1401                               highest_active_reg);                      \
1402                 PUSH_FAILURE_INT (highest_active_reg);                  \
1403                                                                         \
1404                 DEBUG_PRINT2 ("  Pushing pattern 0x%lx: \n",            \
1405                               (long) pattern_place);                    \
1406                 DEBUG_PRINT_COMPILED_PATTERN(bufp, pattern_place, pend); \
1407                 PUSH_FAILURE_POINTER (pattern_place);                   \
1408                                                                         \
1409                 DEBUG_PRINT2("  Pushing string %p: `",                  \
1410                              string_place);                             \
1411                 DEBUG_PRINT_DOUBLE_STRING(string_place,                 \
1412                                           string1, size1,               \
1413                                           string2, size2);              \
1414                 DEBUG_PRINT1("'\n");                                    \
1415                 PUSH_FAILURE_POINTER (string_place);                    \
1416                                                                         \
1417                 DEBUG_PRINT2("  Pushing failure id: %u\n", failure_id); \
1418                 DEBUG_PUSH(failure_id);                                 \
1419         } while (0)
1420
1421 /* This is the number of items that are pushed and popped on the stack
1422    for each register.  */
1423 #define NUM_REG_ITEMS  3
1424
1425 /* Individual items aside from the registers.  */
1426 #ifdef DEBUG
1427 #define NUM_NONREG_ITEMS 5      /* Includes failure point id.  */
1428 #else
1429 #define NUM_NONREG_ITEMS 4
1430 #endif
1431
1432 /* We push at most this many items on the stack.  */
1433 /* We used to use (num_regs - 1), which is the number of registers
1434    this regexp will save; but that was changed to 5
1435    to avoid stack overflow for a regexp with lots of parens.  */
1436 #define MAX_FAILURE_ITEMS (5 * (NUM_REG_ITEMS + NUM_NONREG_ITEMS))
1437
1438 /* We actually push this many items.  */
1439 #define NUM_FAILURE_ITEMS                                               \
1440   ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS         \
1441     + NUM_NONREG_ITEMS)
1442
1443 /* How many items can still be added to the stack without overflowing it.  */
1444 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1445
1446 /* Pops what PUSH_FAIL_STACK pushes.
1447
1448    We restore into the parameters, all of which should be lvalues:
1449      STR -- the saved data position.
1450      PAT -- the saved pattern position.
1451      LOW_REG, HIGH_REG -- the highest and lowest active registers.
1452      REGSTART, REGEND -- arrays of string positions.
1453      REG_INFO -- array of information about each subexpression.
1454
1455    Also assumes the variables `fail_stack' and (if debugging), `bufp',
1456    `pend', `string1', `size1', `string2', and `size2'.  */
1457
1458 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg,                  \
1459                           regstart, regend, reg_info)                   \
1460         do {                                                            \
1461                 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;)         \
1462                         int this_reg;                                   \
1463                 const unsigned char *string_temp;                       \
1464                                                                         \
1465                 assert (!FAIL_STACK_EMPTY ());                          \
1466                                                                         \
1467                 /* Remove failure points and point to                   \
1468                  * how many regs pushed.  */                            \
1469                 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");                  \
1470                 DEBUG_PRINT2 ("  Before pop, next avail: %lu\n",        \
1471                               (unsigned long) fail_stack.avail);        \
1472                 DEBUG_PRINT2 ("                    size: %lu\n",        \
1473                               (unsigned long) fail_stack.size);         \
1474                                                                         \
1475                 assert (fail_stack.avail >= NUM_NONREG_ITEMS);          \
1476                                                                         \
1477                 DEBUG_POP (&ffailure_id.integer);                       \
1478                 DEBUG_PRINT2 ("  Popping failure id: %u\n",             \
1479                               * (unsigned int *) &ffailure_id);         \
1480                                                                         \
1481                 /* If the saved string location is NULL, it             \
1482                    came from an on_failure_keep_string_jump opcode,     \
1483                    and we want to throw away the saved NULL, thus       \
1484                    retaining our current position in the string.  */    \
1485                 string_temp = POP_FAILURE_POINTER ();                   \
1486                 if (string_temp != NULL)                                \
1487                         str = string_temp;                              \
1488                                                                         \
1489                 DEBUG_PRINT2("  Popping string %p: `", str);            \
1490                 DEBUG_PRINT_DOUBLE_STRING(str,                          \
1491                                           string1, size1,               \
1492                                           string2, size2);              \
1493                 DEBUG_PRINT1("'\n");                                    \
1494                                                                         \
1495                 pat = (unsigned char *) POP_FAILURE_POINTER();          \
1496                 DEBUG_PRINT2 ("  Popping pattern %p: ", pat);           \
1497                 DEBUG_PRINT_COMPILED_PATTERN(bufp, pat, pend);          \
1498                                                                         \
1499                 /* Restore register info.  */                           \
1500                 high_reg = (unsigned) POP_FAILURE_INT ();               \
1501                 DEBUG_PRINT2 ("  Popping high active reg: %d\n", high_reg); \
1502                                                                         \
1503                 low_reg = (unsigned) POP_FAILURE_INT ();                \
1504                 DEBUG_PRINT2 ("  Popping  low active reg: %d\n", low_reg); \
1505                                                                         \
1506                 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) { \
1507                         DEBUG_PRINT2 ("    Popping reg: %d\n", this_reg); \
1508                                                                         \
1509                         reg_info[this_reg].word = POP_FAILURE_ELT ();   \
1510                         DEBUG_PRINT2 ("      info: %p\n",               \
1511                                       &reg_info[this_reg]);             \
1512                                                                         \
1513                         regend[this_reg] = POP_FAILURE_POINTER ();      \
1514                         DEBUG_PRINT2 ("      end: %p\n",                \
1515                                       regend[this_reg]);                \
1516                                                                         \
1517                         regstart[this_reg] = POP_FAILURE_POINTER ();    \
1518                         DEBUG_PRINT2 ("      start: %p\n",              \
1519                                       regstart[this_reg]);              \
1520                 }                                                       \
1521                                                                         \
1522                 set_regs_matched_done = 0;                              \
1523                 DEBUG_STATEMENT (nfailure_points_popped++);             \
1524         } while (0)                     /* POP_FAILURE_POINT */
1525 \f
1526 /* Structure for per-register (a.k.a. per-group) information.
1527    Other register information, such as the
1528    starting and ending positions (which are addresses), and the list of
1529    inner groups (which is a bits list) are maintained in separate
1530    variables.
1531
1532    We are making a (strictly speaking) nonportable assumption here: that
1533    the compiler will pack our bit fields into something that fits into
1534    the type of `word', i.e., is something that fits into one item on the
1535    failure stack.  */
1536
1537 typedef union {
1538         fail_stack_elt_t word;
1539         struct {
1540                 /* This field is one if this group can match the empty string,
1541                    zero if not.  If not yet determined,  `MATCH_NULL_UNSET_VALUE'.  */
1542 #define MATCH_NULL_UNSET_VALUE 3
1543                 unsigned match_null_string_p:2;
1544                 unsigned is_active:1;
1545                 unsigned matched_something:1;
1546                 unsigned ever_matched_something:1;
1547         } bits;
1548 } register_info_type;
1549
1550 #define REG_MATCH_NULL_STRING_P(R)  ((R).bits.match_null_string_p)
1551 #define IS_ACTIVE(R)  ((R).bits.is_active)
1552 #define MATCHED_SOMETHING(R)  ((R).bits.matched_something)
1553 #define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something)
1554
1555 /* Call this when have matched a real character; it sets `matched' flags
1556    for the subexpressions which we are currently inside.  Also records
1557    that those subexprs have matched.  */
1558 #define SET_REGS_MATCHED()                                              \
1559   do                                                                    \
1560     {                                                                   \
1561       if (!set_regs_matched_done)                                       \
1562         {                                                               \
1563           Element_count r;                                              \
1564           set_regs_matched_done = 1;                                    \
1565           for (r = lowest_active_reg; r <= highest_active_reg; r++)     \
1566             {                                                           \
1567               MATCHED_SOMETHING (reg_info[r])                           \
1568                 = EVER_MATCHED_SOMETHING (reg_info[r])                  \
1569                 = 1;                                                    \
1570             }                                                           \
1571         }                                                               \
1572     }                                                                   \
1573   while (0)
1574
1575 /* Registers are set to a sentinel when they haven't yet matched.  */
1576 static unsigned char reg_unset_dummy;
1577 #define REG_UNSET_VALUE (&reg_unset_dummy)
1578 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1579 \f
1580 /* Subroutine declarations and macros for regex_compile.  */
1581
1582 /* Fetch the next character in the uncompiled pattern---translating it
1583    if necessary.  Also cast from a signed character in the constant
1584    string passed to us by the user to an unsigned char that we can use
1585    as an array index (in, e.g., `translate').  */
1586 #define PATFETCH(c)                                                     \
1587   do {                                                                  \
1588     PATFETCH_RAW (c);                                                   \
1589     c = TRANSLATE (c);                                                  \
1590   } while (0)
1591
1592 /* Fetch the next character in the uncompiled pattern, with no
1593    translation.  */
1594 #define PATFETCH_RAW(c)                                                 \
1595   do {if (p == pend) return REG_EEND;                                   \
1596     assert (p < pend);                                                  \
1597     c = charptr_emchar (p);                                             \
1598     INC_CHARPTR (p);                                                    \
1599   } while (0)
1600
1601 /* Go backwards one character in the pattern.  */
1602 #define PATUNFETCH DEC_CHARPTR (p)
1603
1604 #ifdef MULE
1605
1606 #define PATFETCH_EXTENDED(emch)                                         \
1607   do {if (p == pend) return REG_EEND;                                   \
1608     assert (p < pend);                                                  \
1609     emch = charptr_emchar ((const Bufbyte *) p);                        \
1610     INC_CHARPTR (p);                                                    \
1611     if (TRANSLATE_P (translate) && emch < 0x80)                         \
1612       emch = (Emchar) (unsigned char) RE_TRANSLATE (emch);              \
1613   } while (0)
1614
1615 #define PATFETCH_RAW_EXTENDED(emch)                                     \
1616   do {if (p == pend) return REG_EEND;                                   \
1617     assert (p < pend);                                                  \
1618     emch = charptr_emchar ((const Bufbyte *) p);                        \
1619     INC_CHARPTR (p);                                                    \
1620   } while (0)
1621
1622 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1623
1624 #define PATFETCH_EITHER(emch)                   \
1625   do {                                          \
1626     if (has_extended_chars)                     \
1627       PATFETCH_EXTENDED (emch);                 \
1628     else                                        \
1629       PATFETCH (emch);                          \
1630   } while (0)
1631
1632 #define PATFETCH_RAW_EITHER(emch)               \
1633   do {                                          \
1634     if (has_extended_chars)                     \
1635       PATFETCH_RAW_EXTENDED (emch);             \
1636     else                                        \
1637       PATFETCH_RAW (emch);                      \
1638   } while (0)
1639
1640 #define PATUNFETCH_EITHER                       \
1641   do {                                          \
1642     if (has_extended_chars)                     \
1643       PATUNFETCH_EXTENDED (emch);               \
1644     else                                        \
1645       PATUNFETCH (emch);                        \
1646   } while (0)
1647
1648 #else                           /* not MULE */
1649
1650 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1651 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1652 #define PATUNFETCH_EITHER PATUNFETCH
1653
1654 #endif                          /* MULE */
1655
1656 /* If `translate' is non-null, return translate[D], else just D.  We
1657    cast the subscript to translate because some data is declared as
1658    `char *', to avoid warnings when a string constant is passed.  But
1659    when we use a character as a subscript we must make it unsigned.  */
1660 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1661
1662 /* Macros for outputting the compiled pattern into `buffer'.  */
1663
1664 /* If the buffer isn't allocated when it comes in, use this.  */
1665 #define INIT_BUF_SIZE  32
1666
1667 /* Make sure we have at least N more bytes of space in buffer.  */
1668 #define GET_BUFFER_SPACE(n)                                             \
1669     while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated)  \
1670       EXTEND_BUFFER ()
1671
1672 /* Make sure we have one more byte of buffer space and then add C to it.  */
1673 #define BUF_PUSH(c)                                                     \
1674   do {                                                                  \
1675     GET_BUFFER_SPACE (1);                                               \
1676     *buf_end++ = (unsigned char) (c);                                   \
1677   } while (0)
1678
1679 /* Ensure we have two more bytes of buffer space and then append C1 and C2.  */
1680 #define BUF_PUSH_2(c1, c2)                                              \
1681   do {                                                                  \
1682     GET_BUFFER_SPACE (2);                                               \
1683     *buf_end++ = (unsigned char) (c1);                                  \
1684     *buf_end++ = (unsigned char) (c2);                                  \
1685   } while (0)
1686
1687 /* As with BUF_PUSH_2, except for three bytes.  */
1688 #define BUF_PUSH_3(c1, c2, c3)                                          \
1689   do {                                                                  \
1690     GET_BUFFER_SPACE (3);                                               \
1691     *buf_end++ = (unsigned char) (c1);                                  \
1692     *buf_end++ = (unsigned char) (c2);                                  \
1693     *buf_end++ = (unsigned char) (c3);                                  \
1694   } while (0)
1695
1696 /* Store a jump with opcode OP at LOC to location TO.  We store a
1697    relative address offset by the three bytes the jump itself occupies.  */
1698 #define STORE_JUMP(op, loc, to) \
1699   store_op1 (op, loc, (to) - (loc) - 3)
1700
1701 /* Likewise, for a two-argument jump.  */
1702 #define STORE_JUMP2(op, loc, to, arg) \
1703   store_op2 (op, loc, (to) - (loc) - 3, arg)
1704
1705 /* Like `STORE_JUMP', but for inserting.  Assume `buf_end' is the
1706    buffer end.  */
1707 #define INSERT_JUMP(op, loc, to) \
1708   insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1709
1710 /* Like `STORE_JUMP2', but for inserting.  Assume `buf_end' is the
1711    buffer end.  */
1712 #define INSERT_JUMP2(op, loc, to, arg) \
1713   insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1714
1715 /* This is not an arbitrary limit: the arguments which represent offsets
1716    into the pattern are two bytes long.  So if 2^16 bytes turns out to
1717    be too small, many things would have to change.  */
1718 #define MAX_BUF_SIZE (1L << 16)
1719
1720 /* Extend the buffer by twice its current size via realloc and
1721    reset the pointers that pointed into the old block to point to the
1722    correct places in the new one.  If extending the buffer results in it
1723    being larger than MAX_BUF_SIZE, then flag memory exhausted.  */
1724 #define EXTEND_BUFFER()                                                 \
1725         do {                                                            \
1726                 re_char *old_buffer = bufp->buffer;                     \
1727                 if (bufp->allocated == MAX_BUF_SIZE)                    \
1728                         return REG_ESIZE;                               \
1729                 bufp->allocated <<= 1;                                  \
1730                 if (bufp->allocated > MAX_BUF_SIZE)                     \
1731                         bufp->allocated = MAX_BUF_SIZE;                 \
1732                 bufp->buffer = (unsigned char *)xrealloc(               \
1733                         bufp->buffer, bufp->allocated);                 \
1734                 if (bufp->buffer == NULL) {                             \
1735                         return REG_ESPACE;                              \
1736                 }                                                       \
1737                 /* If the buffer moved, move all the pointers into it.  */ \
1738                 if (old_buffer != bufp->buffer) {                       \
1739                         buf_end = (buf_end - old_buffer) + bufp->buffer; \
1740                         begalt = (begalt - old_buffer) + bufp->buffer;  \
1741                         if (fixup_alt_jump) {                           \
1742                                 fixup_alt_jump =                        \
1743                                         (fixup_alt_jump - old_buffer) + \
1744                                         bufp->buffer;                   \
1745                         }                                               \
1746                         if (laststart) {                                \
1747                                 laststart = (laststart - old_buffer) +  \
1748                                         bufp->buffer;                   \
1749                         }                                               \
1750                         if (pending_exact) {                            \
1751                                 pending_exact =                         \
1752                                         (pending_exact - old_buffer) +  \
1753                                         bufp->buffer;                   \
1754                         }                                               \
1755                 }                                                       \
1756         } while (0)
1757
1758 /* Since we have one byte reserved for the register number argument to
1759    {start,stop}_memory, the maximum number of groups we can report
1760    things about is what fits in that byte.  */
1761 #define MAX_REGNUM 255
1762
1763 /* But patterns can have more than `MAX_REGNUM' registers.  We just
1764    ignore the excess.  */
1765 typedef unsigned regnum_t;
1766
1767 #define INIT_REG_TRANSLATE_SIZE 5
1768
1769 /* Macros for the compile stack.  */
1770
1771 /* Since offsets can go either forwards or backwards, this type needs to
1772    be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.  */
1773 typedef int pattern_offset_t;
1774
1775 typedef struct {
1776         pattern_offset_t begalt_offset;
1777         pattern_offset_t fixup_alt_jump;
1778         pattern_offset_t inner_group_offset;
1779         pattern_offset_t laststart_offset;
1780         regnum_t regnum;
1781 } compile_stack_elt_t;
1782
1783 typedef struct {
1784         compile_stack_elt_t *stack;
1785         unsigned size;
1786         unsigned avail;         /* Offset of next open position.  */
1787 } compile_stack_type;
1788
1789 #define INIT_COMPILE_STACK_SIZE 32
1790
1791 #define COMPILE_STACK_EMPTY  (compile_stack.avail == 0)
1792 #define COMPILE_STACK_FULL  (compile_stack.avail == compile_stack.size)
1793
1794 /* The next available element.  */
1795 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1796
1797 /* Set the bit for character C in a bit vector.  */
1798 #define SET_LIST_BIT(c)                         \
1799   (buf_end[((unsigned char) (c)) / BYTEWIDTH]   \
1800    |= 1 << (((unsigned char) c) % BYTEWIDTH))
1801
1802 #ifdef MULE
1803
1804 /* Set the "bit" for character C in a range table. */
1805 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1806
1807 /* Set the "bit" for character c in the appropriate table. */
1808 #define SET_EITHER_BIT(c)                       \
1809         do {                                    \
1810                 if (has_extended_chars)         \
1811                         SET_RANGETAB_BIT (c);   \
1812                 else                            \
1813                         SET_LIST_BIT (c);       \
1814         } while (0)
1815
1816 #else                           /* not MULE */
1817
1818 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1819
1820 #endif
1821
1822 /* Get the next unsigned number in the uncompiled pattern.  */
1823 #define GET_UNSIGNED_NUMBER(num)                                        \
1824   { if (p != pend)                                                      \
1825      {                                                                  \
1826        PATFETCH (c);                                                    \
1827        while (ISDIGIT (c))                                              \
1828          {                                                              \
1829            if (num < 0)                                                 \
1830               num = 0;                                                  \
1831            num = num * 10 + c - '0';                                    \
1832            if (p == pend)                                               \
1833               break;                                                    \
1834            PATFETCH (c);                                                \
1835          }                                                              \
1836        }                                                                \
1837     }
1838
1839 #define CHAR_CLASS_MAX_LENGTH  9        /* Namely, `multibyte'.  */
1840
1841 \f
1842 static void store_op1(re_opcode_t op, unsigned char *loc, int arg);
1843 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1844 static void insert_op1(re_opcode_t op, unsigned char *loc, int arg,
1845                        unsigned char *end);
1846 static void insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1847                        unsigned char *end);
1848 static re_bool at_begline_loc_p(re_char*, re_char*, reg_syntax_t);
1849 static re_bool at_endline_loc_p(re_char*, re_char*, int syntax);
1850 static re_bool group_in_compile_stack(compile_stack_type compile_stack,
1851                                       regnum_t regnum);
1852 static reg_errcode_t compile_range(re_char ** p_ptr, re_char * pend,
1853                                    RE_TRANSLATE_TYPE translate,
1854                                    reg_syntax_t syntax, unsigned char *b);
1855 #ifdef MULE
1856 static reg_errcode_t compile_extended_range(re_char ** p_ptr,
1857                                             re_char * pend,
1858                                             RE_TRANSLATE_TYPE translate,
1859                                             reg_syntax_t syntax,
1860                                             Lisp_Object rtab);
1861 #endif                          /* MULE */
1862 static re_bool group_match_null_string_p(unsigned char **p,
1863                                          unsigned char *end,
1864                                          register_info_type * reg_info);
1865 static re_bool alt_match_null_string_p(unsigned char *p, unsigned char *end,
1866                                        register_info_type * reg_info);
1867 static re_bool common_op_match_null_string_p(unsigned char **p,
1868                                              unsigned char *end,
1869                                              register_info_type * reg_info);
1870 static int bcmp_translate(const unsigned char *s1, const unsigned char *s2,
1871                           REGISTER int len, RE_TRANSLATE_TYPE translate);
1872 static int re_match_2_internal(struct re_pattern_buffer *bufp,
1873                                re_char * string1, int size1,
1874                                re_char * string2, int size2, int pos,
1875                                struct re_registers *regs, int stop);
1876 \f
1877 #ifndef MATCH_MAY_ALLOCATE
1878
1879 /* If we cannot allocate large objects within re_match_2_internal,
1880    we make the fail stack and register vectors global.
1881    The fail stack, we grow to the maximum size when a regexp
1882    is compiled.
1883    The register vectors, we adjust in size each time we
1884    compile a regexp, according to the number of registers it needs.  */
1885
1886 static fail_stack_type fail_stack;
1887
1888 /* Size with which the following vectors are currently allocated.
1889    That is so we can make them bigger as needed,
1890    but never make them smaller.  */
1891 static int regs_allocated_size;
1892
1893 static re_char **regstart, **regend;
1894 static re_char **old_regstart, **old_regend;
1895 static re_char **best_regstart, **best_regend;
1896 static register_info_type *reg_info;
1897 static re_char **reg_dummy;
1898 static register_info_type *reg_info_dummy;
1899
1900 /* Make the register vectors big enough for NUM_REGS registers,
1901    but don't make them smaller.  */
1902
1903 static void regex_grow_registers(int num_regs)
1904 {
1905         if (num_regs > regs_allocated_size) {
1906                 RETALLOC_IF(regstart, num_regs, re_char *);
1907                 RETALLOC_IF(regend, num_regs, re_char *);
1908                 RETALLOC_IF(old_regstart, num_regs, re_char *);
1909                 RETALLOC_IF(old_regend, num_regs, re_char *);
1910                 RETALLOC_IF(best_regstart, num_regs, re_char *);
1911                 RETALLOC_IF(best_regend, num_regs, re_char *);
1912                 RETALLOC_IF(reg_info, num_regs, register_info_type);
1913                 RETALLOC_IF(reg_dummy, num_regs, re_char *);
1914                 RETALLOC_IF(reg_info_dummy, num_regs, register_info_type);
1915
1916                 regs_allocated_size = num_regs;
1917         }
1918 }
1919
1920 #endif                          /* not MATCH_MAY_ALLOCATE */
1921
1922 /* auxiliary stuff, FSF theft */
1923 /* Character classes.  */
1924 typedef enum {
1925         RECC_ERROR = 0,
1926         RECC_ALNUM, RECC_ALPHA, RECC_WORD,
1927         RECC_GRAPH, RECC_PRINT,
1928         RECC_LOWER, RECC_UPPER,
1929         RECC_PUNCT, RECC_CNTRL,
1930         RECC_DIGIT, RECC_XDIGIT,
1931         RECC_BLANK, RECC_SPACE,
1932         RECC_MULTIBYTE, RECC_NONASCII,
1933         RECC_ASCII, RECC_UNIBYTE
1934 } re_wctype_t;
1935
1936 /* Map a string to the char class it names (if any).  */
1937 static inline re_wctype_t
1938 re_wctype(re_char *str)
1939 {
1940         const char *string = (const char*)str;
1941
1942         if (STREQ (string, "alnum")) {
1943                 return RECC_ALNUM;
1944         } else if (STREQ (string, "alpha")) {
1945                 return RECC_ALPHA;
1946         } else if (STREQ (string, "digit")) {
1947                 return RECC_DIGIT;
1948         } else if (STREQ (string, "graph")) {
1949                 return RECC_GRAPH;
1950         } else if (STREQ (string, "space")) {
1951                 return RECC_SPACE;
1952         } else if (STREQ (string, "upper")) {
1953                 return RECC_UPPER;
1954         } else if (STREQ (string, "lower")) {
1955                 return RECC_LOWER;
1956         } else if (STREQ (string, "word")) {
1957                 return RECC_WORD;
1958         } else if (STREQ (string, "print")) {
1959                 return RECC_PRINT;
1960         } else if (STREQ (string, "punct")) {
1961                 return RECC_PUNCT;
1962         } else if (STREQ (string, "xdigit")) {
1963                 return RECC_XDIGIT;
1964         } else if (STREQ (string, "cntrl")) {
1965                 return RECC_CNTRL;
1966         } else if (STREQ (string, "blank")) {
1967                 return RECC_BLANK;
1968         } else if (STREQ (string, "ascii")) {
1969                 return RECC_ASCII;
1970         } else if (STREQ (string, "nonascii")) {
1971                 return RECC_NONASCII;
1972         } else if (STREQ (string, "unibyte")) {
1973                 return RECC_UNIBYTE;
1974         } else if (STREQ (string, "multibyte")) {
1975                 return RECC_MULTIBYTE;
1976         } else {
1977                 return 0;
1978         }
1979 }
1980
1981 /* True if CH is in the char class CC.  */
1982 static inline bool
1983 re_iswctype(int ch, re_wctype_t cc)
1984 {
1985         switch (cc) {
1986         case RECC_ALNUM:
1987                 return ISALNUM (ch);
1988         case RECC_ALPHA:
1989                 return ISALPHA (ch);
1990         case RECC_BLANK:
1991                 return ISBLANK (ch);
1992         case RECC_CNTRL:
1993                 return ISCNTRL (ch);
1994         case RECC_DIGIT:
1995                 return ISDIGIT (ch);
1996         case RECC_GRAPH:
1997                 return ISGRAPH (ch);
1998         case RECC_LOWER:
1999                 return ISLOWER (ch);
2000         case RECC_PRINT:
2001                 return ISPRINT (ch);
2002         case RECC_PUNCT:
2003                 return ISPUNCT (ch);
2004         case RECC_SPACE:
2005                 return ISSPACE (ch);
2006         case RECC_UPPER:
2007                 return ISUPPER (ch);
2008         case RECC_XDIGIT:
2009                 return ISXDIGIT (ch);
2010         case RECC_ASCII:
2011                 return IS_REAL_ASCII (ch);
2012         case RECC_NONASCII:
2013                 return !IS_REAL_ASCII (ch);
2014         case RECC_WORD:
2015                 return ISWORD(ch);
2016         case RECC_UNIBYTE:
2017                 return ISUNIBYTE((unsigned int)ch);
2018         case RECC_MULTIBYTE:
2019                 return !ISUNIBYTE((unsigned int)ch);
2020         case RECC_ERROR:
2021                 return false;
2022         default:
2023                 abort();
2024         }
2025         return false;
2026 }
2027
2028 \f
2029 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2030    Returns one of error codes defined in `regex.h', or zero for success.
2031
2032    Assumes the `allocated' (and perhaps `buffer') and `translate'
2033    fields are set in BUFP on entry.
2034
2035    If it succeeds, results are put in BUFP (if it returns an error, the
2036    contents of BUFP are undefined):
2037      `buffer' is the compiled pattern;
2038      `syntax' is set to SYNTAX;
2039      `used' is set to the length of the compiled pattern;
2040      `fastmap_accurate' is zero;
2041      `re_ngroups' is the number of groups/subexpressions (including shy
2042         groups) in PATTERN;
2043      `re_nsub' is the number of non-shy groups in PATTERN;
2044      `not_bol' and `not_eol' are zero;
2045
2046    The `fastmap' and `newline_anchor' fields are neither
2047    examined nor set.  */
2048
2049 static reg_errcode_t
2050 regex_compile(re_char * pattern, int size, reg_syntax_t syntax,
2051               struct re_pattern_buffer *bufp)
2052 {
2053         /* We fetch characters from PATTERN here.  We declare these as int
2054            (or possibly long) so that chars above 127 can be used as
2055            array indices.  The macros that fetch a character from the pattern
2056            make sure to coerce to unsigned char before assigning, so we won't
2057            get bitten by negative numbers here. */
2058         /* XEmacs change: used to be unsigned char. */
2059         REGISTER EMACS_INT c, c1;
2060
2061         /* A random temporary spot in PATTERN.  */
2062         re_char *p1;
2063
2064         /* Points to the end of the buffer, where we should append.  */
2065         REGISTER unsigned char *buf_end;
2066
2067         /* Keeps track of unclosed groups.  */
2068         compile_stack_type compile_stack;
2069
2070         /* Points to the current (ending) position in the pattern.  */
2071         re_char *p = pattern;
2072         re_char *pend = pattern + size;
2073
2074         /* How to translate the characters in the pattern.  */
2075         RE_TRANSLATE_TYPE translate = bufp->translate;
2076
2077         /* Address of the count-byte of the most recently inserted `exactn'
2078            command.  This makes it possible to tell if a new exact-match
2079            character can be added to that command or if the character requires
2080            a new `exactn' command.  */
2081         unsigned char *pending_exact = 0;
2082
2083         /* Address of start of the most recently finished expression.
2084            This tells, e.g., postfix * where to find the start of its
2085            operand.  Reset at the beginning of groups and alternatives.  */
2086         unsigned char *laststart = 0;
2087
2088         /* Address of beginning of regexp, or inside of last group.  */
2089         unsigned char *begalt;
2090
2091         /* Place in the uncompiled pattern (i.e., the {) to
2092            which to go back if the interval is invalid.  */
2093         re_char *beg_interval;
2094
2095         /* Address of the place where a forward jump should go to the end of
2096            the containing expression.  Each alternative of an `or' -- except the
2097            last -- ends with a forward jump of this sort.  */
2098         unsigned char *fixup_alt_jump = 0;
2099
2100         /* Counts open-groups as they are encountered.  Remembered for the
2101            matching close-group on the compile stack, so the same register
2102            number is put in the stop_memory as the start_memory.  */
2103         regnum_t regnum = 0;
2104         auto inline void free_stack(void) __attribute__((always_inline));
2105
2106         /* we need to close over compile_stack */
2107         auto inline void free_stack(void)
2108         {
2109                 xfree(compile_stack.stack);
2110                 return;
2111         }
2112
2113 #ifdef REGEX_DEBUG_FLAG
2114         DEBUG_PRINT1("\nCompiling pattern: ");
2115         if (debug) {
2116                 int debug_count;
2117
2118                 for (debug_count = 0; debug_count < size; debug_count++)
2119                         putchar(pattern[debug_count]);
2120                 putchar('\n');
2121         }
2122 #endif                          /* DEBUG */
2123
2124         /* Initialize the compile stack.  */
2125         compile_stack.stack =
2126                 TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2127         if (compile_stack.stack == NULL) {
2128                 return REG_ESPACE;
2129         }
2130         compile_stack.size = INIT_COMPILE_STACK_SIZE;
2131         compile_stack.avail = 0;
2132
2133         /* Initialize the pattern buffer.  */
2134         bufp->syntax = syntax;
2135         bufp->fastmap_accurate = 0;
2136         bufp->not_bol = bufp->not_eol = 0;
2137
2138         /* Set `used' to zero, so that if we return an error, the pattern
2139            printer (for debugging) will think there's no pattern.  We reset it
2140            at the end.  */
2141         bufp->used = 0;
2142
2143         /* Always count groups, whether or not bufp->no_sub is set.  */
2144         bufp->re_nsub = 0;
2145         bufp->re_ngroups = 0;
2146
2147         /* Allocate index translation array if needed. */
2148         if (bufp->external_to_internal_register == 0) {
2149                 bufp->external_to_internal_register_size =
2150                     INIT_REG_TRANSLATE_SIZE;
2151                 RETALLOC(bufp->external_to_internal_register,
2152                          bufp->external_to_internal_register_size, int);
2153         }
2154         /* Initialize translations to impossible value to aid debugging. */
2155         {
2156                 int i;
2157
2158                 bufp->external_to_internal_register[0] = 0;
2159                 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2160                         bufp->external_to_internal_register[i] =
2161                             (int)0xDEADBEEF;
2162         }
2163
2164 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2165         /* Initialize the syntax table.  */
2166         init_syntax_once();
2167 #endif
2168
2169         if (bufp->allocated == 0) {
2170                 if (bufp->buffer) {
2171                         /* If zero allocated, but buffer is non-null, try to
2172                            realloc enough space.  This loses if buffer's address
2173                            is bogus, but that is the user's responsibility.  */
2174                         RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
2175                 } else {
2176                         /* Caller did not allocate a buffer.  Do it for them. */
2177                         bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
2178                 }
2179                 if (!bufp->buffer) {
2180                         free_stack();
2181                         return REG_ESPACE;
2182                 }
2183
2184                 bufp->allocated = INIT_BUF_SIZE;
2185         }
2186
2187         begalt = buf_end = bufp->buffer;
2188
2189         /* Loop through the uncompiled pattern until we're at the end.  */
2190         while (p != pend) {
2191                 PATFETCH(c);
2192
2193                 switch (c) {
2194                 case '^': {
2195                         if (    /* If at start of pattern, it's an operator.  */
2196                                 p == pattern + 1
2197                                 /* If context independent, it's an operator.  */
2198                                 || syntax & RE_CONTEXT_INDEP_ANCHORS
2199                                 /* Otherwise, depends on what's come before.  */
2200                                 || at_begline_loc_p(pattern, p, syntax))
2201                                 BUF_PUSH(begline);
2202                         else
2203                                 goto normal_char;
2204                 }
2205                         break;
2206
2207                 case '$': {
2208                         if (    /* If at end of pattern, it's an
2209                                    operator.  */
2210                                 p == pend
2211                                 /* If context independent, it's an
2212                                    operator.  */
2213                                 || syntax & RE_CONTEXT_INDEP_ANCHORS
2214                                 /* Otherwise, depends on what's
2215                                    next.  */
2216                                 || at_endline_loc_p(p, pend, syntax))
2217                                 BUF_PUSH(endline);
2218                         else
2219                                 goto normal_char;
2220                 }
2221                         break;
2222
2223                 case '+':
2224                 case '?':
2225                         if ((syntax & RE_BK_PLUS_QM) ||
2226                             (syntax & RE_LIMITED_OPS)) {
2227                                 goto normal_char;
2228                         }
2229                         /* pardon? */
2230                 handle_plus:
2231                 case '*': {
2232                         re_bool zero_times_ok;
2233                         re_bool many_times_ok;
2234                         re_bool minimal;
2235
2236                         /* If there is no previous pattern... */
2237                         if (!laststart) {
2238                                 if (syntax & RE_CONTEXT_INVALID_OPS) {
2239                                         free_stack();
2240                                         return REG_BADRPT;
2241                                 } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) {
2242                                         goto normal_char;
2243                                 }
2244                         }
2245
2246                         /* true means zero/many matches are allowed. */
2247                         zero_times_ok = c != '+';
2248                         many_times_ok = c != '?';
2249
2250                         /* true means match shortest string possible. */
2251                         minimal = false;
2252
2253                         /* If there is a sequence of repetition chars, collapse it
2254                            down to just one (the right one).  We can't combine
2255                            interval operators with these because of, e.g., `a{2}*',
2256                            which should only match an even number of `a's.  */
2257                         while (p != pend) {
2258                                 PATFETCH(c);
2259
2260                                 if (c == '*'
2261                                     || (!(syntax & RE_BK_PLUS_QM)
2262                                         && (c == '+' || c == '?'))) ;
2263
2264                                 else if (syntax & RE_BK_PLUS_QM
2265                                          && c == '\\') {
2266                                         if (p == pend) {
2267                                                 free_stack();
2268                                                 return REG_EESCAPE;
2269                                         }
2270
2271                                         PATFETCH(c1);
2272                                         if (!(c1 == '+' || c1 == '?')) {
2273                                                 PATUNFETCH;
2274                                                 PATUNFETCH;
2275                                                 break;
2276                                         }
2277
2278                                         c = c1;
2279                                 } else {
2280                                         PATUNFETCH;
2281                                         break;
2282                                 }
2283
2284                                 /* If we get here, we found another repeat
2285                                    character.  */
2286                                 if (!(syntax & RE_NO_MINIMAL_MATCHING)) {
2287                                         /* "*?" and "+?" and "??" are okay (and
2288                                            mean match minimally), but other
2289                                            sequences (such as "*??" and "+++")
2290                                            are rejected (reserved for future
2291                                            use). */
2292                                         if (minimal || c != '?') {
2293                                                 free_stack();
2294                                                 return REG_BADRPT;
2295                                         }
2296                                         minimal = true;
2297                                 } else {
2298                                         zero_times_ok |= c != '+';
2299                                         many_times_ok |= c != '?';
2300                                 }
2301                         }
2302
2303                         /* Star, etc. applied to an empty pattern is equivalent
2304                            to an empty pattern.  */
2305                         if (!laststart) {
2306                                 break;
2307                         }
2308
2309                         /* Now we know whether zero matches is allowed
2310                            and whether two or more matches is allowed
2311                            and whether we want minimal or maximal matching. */
2312                         if (minimal) {
2313                                 if (!many_times_ok) {
2314                                         /* "a??" becomes:
2315                                            0: /on_failure_jump to 6
2316                                            3: /jump to 9
2317                                            6: /exactn/1/A
2318                                            9: end of pattern.
2319                                         */
2320                                         GET_BUFFER_SPACE(6);
2321                                         INSERT_JUMP(jump, laststart,
2322                                                     buf_end + 3);
2323                                         buf_end += 3;
2324                                         INSERT_JUMP(on_failure_jump,
2325                                                     laststart,
2326                                                     laststart + 6);
2327                                         buf_end += 3;
2328                                 } else if (zero_times_ok) {
2329                                         /* "a*?" becomes:
2330                                            0: /jump to 6
2331                                            3: /exactn/1/A
2332                                            6: /on_failure_jump to 3
2333                                            9: end of pattern.
2334                                         */
2335                                         GET_BUFFER_SPACE(6);
2336                                         INSERT_JUMP(jump, laststart,
2337                                                     buf_end + 3);
2338                                         buf_end += 3;
2339                                         STORE_JUMP(on_failure_jump,
2340                                                    buf_end,
2341                                                    laststart + 3);
2342                                         buf_end += 3;
2343                                 } else {
2344                                         /* "a+?" becomes:
2345                                            0: /exactn/1/A
2346                                            3: /on_failure_jump to 0
2347                                            6: end of pattern.
2348                                         */
2349                                         GET_BUFFER_SPACE(3);
2350                                         STORE_JUMP(on_failure_jump,
2351                                                    buf_end, laststart);
2352                                         buf_end += 3;
2353                                 }
2354                         } else {
2355                                 /* Are we optimizing this jump?  */
2356                                 re_bool keep_string_p = false;
2357
2358                                 if (many_times_ok) {
2359                                         /* More than one repetition is allowed,
2360                                            so put in at the end a backward
2361                                            relative jump from `buf_end' to
2362                                            before the next jump we're going to
2363                                            put in below (which jumps from
2364                                            laststart to after this jump).
2365
2366                                            But if we are at the `*' in the exact
2367                                            sequence `.*\n', insert an
2368                                            unconditional jump backwards to the
2369                                            ., instead of the beginning of the
2370                                            loop.  This way we only push a
2371                                            failure point once, instead of every
2372                                            time through the loop.  */
2373                                         assert(p - 1 > pattern);
2374
2375                                         /* Allocate the space for the jump.  */
2376                                         GET_BUFFER_SPACE(3);
2377
2378                                         /* We know we are not at the first
2379                                            character of the pattern, because
2380                                            laststart was nonzero.  And we've
2381                                            already incremented `p', by the way,
2382                                            to be the character after the `*'.
2383                                            Do we have to do something analogous
2384                                            here for null bytes, because of
2385                                            RE_DOT_NOT_NULL? */
2386                                         if (*(p - 2) == '.' &&
2387                                             zero_times_ok &&
2388                                             p < pend &&
2389                                             *p == '\n' &&
2390                                             !(syntax & RE_DOT_NEWLINE)) {
2391                                                 /* We have .*\n.  */
2392                                                 STORE_JUMP(jump,
2393                                                            buf_end,
2394                                                            laststart);
2395                                                 keep_string_p = true;
2396                                         } else {
2397                                                 /* Anything else.  */
2398                                                 STORE_JUMP
2399                                                         (maybe_pop_jump,
2400                                                          buf_end,
2401                                                          laststart - 3);
2402                                         }
2403                                         /* We've added more stuff to the
2404                                            buffer.  */
2405                                         buf_end += 3;
2406                                 }
2407
2408                                 /* On failure, jump from laststart to buf_end +
2409                                    3, which will be the end of the buffer after
2410                                    this jump is inserted.  */
2411                                 GET_BUFFER_SPACE(3);
2412                                 INSERT_JUMP(keep_string_p ?
2413                                             on_failure_keep_string_jump
2414                                             : on_failure_jump,
2415                                             laststart, buf_end + 3);
2416                                 buf_end += 3;
2417
2418                                 if (!zero_times_ok) {
2419                                         /* At least one repetition is required,
2420                                            so insert a `dummy_failure_jump'
2421                                            before the initial `on_failure_jump'
2422                                            instruction of the loop. This effects
2423                                            a skip over that instruction the
2424                                            first time we hit that loop.  */
2425                                         GET_BUFFER_SPACE(3);
2426                                         INSERT_JUMP(dummy_failure_jump,
2427                                                     laststart,
2428                                                     laststart + 6);
2429                                         buf_end += 3;
2430                                 }
2431                         }
2432                         pending_exact = 0;
2433                 }
2434                         break;
2435
2436                 case '.':
2437                         laststart = buf_end;
2438                         BUF_PUSH(anychar);
2439                         break;
2440
2441                 case '[': {
2442                         /* XEmacs change: this whole section */
2443                         re_bool had_char_class = false;
2444 #ifdef MULE
2445                         re_bool has_extended_chars = false;
2446                         REGISTER Lisp_Object rtab = Qnil;
2447 #endif
2448
2449                         if (p == pend) {
2450                                 free_stack();
2451                                 return REG_EBRACK;
2452                         }
2453
2454                         /* Ensure that we have enough space to push a charset:
2455                            the opcode, the length count, and the bitset; 34
2456                            bytes in all.  */
2457                         GET_BUFFER_SPACE(34);
2458
2459                         laststart = buf_end;
2460
2461                         /* We test `*p == '^' twice, instead of using an if
2462                            statement, so we only need one BUF_PUSH.  */
2463                         BUF_PUSH(*p == '^' ? charset_not : charset);
2464                         if (*p == '^')
2465                                 p++;
2466
2467                         /* Remember the first position in the bracket
2468                            expression.  */
2469                         p1 = p;
2470
2471                         /* Push the number of bytes in the bitmap.  */
2472                         BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2473
2474                         /* Clear the whole map.  */
2475                         memset(buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2476
2477                         /* charset_not matches newline according to a syntax
2478                            bit.  */
2479                         if ((re_opcode_t) buf_end[-2] == charset_not
2480                             && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2481                                 SET_LIST_BIT('\n');
2482
2483 #ifdef MULE
2484                         start_over_with_extended:
2485                         if (has_extended_chars) {
2486                                 /* There are extended chars here, which means we
2487                                    need to start over and shift to unified
2488                                    range-table format. */
2489                                 if (buf_end[-2] == charset)
2490                                         buf_end[-2] = charset_mule;
2491                                 else
2492                                         buf_end[-2] = charset_mule_not;
2493                                 buf_end--;
2494                                 p = p1;
2495                                 /* go back to the beginning of the charset,
2496                                    after a possible ^. */
2497                                 rtab = Vthe_lisp_rangetab;
2498                                 Fclear_range_table(rtab);
2499
2500                                 /* charset_not matches newline according to a
2501                                    syntax bit.  */
2502                                 if ((re_opcode_t) buf_end[-1] ==
2503                                     charset_mule_not
2504                                     && (syntax &
2505                                         RE_HAT_LISTS_NOT_NEWLINE))
2506                                         SET_EITHER_BIT('\n');
2507                         }
2508 #endif                          /* MULE */
2509
2510                                 /* Read in characters and ranges, setting map bits.  */
2511                         for (;;) {
2512                                 if (p == pend) {
2513                                         free_stack();
2514                                         return REG_EBRACK;
2515                                 }
2516                                 PATFETCH(c);
2517
2518 #ifdef MULE
2519                                 if (c >= 0x80 && !has_extended_chars) {
2520                                         has_extended_chars = 1;
2521                                         /* Frumble-bumble, we've found some
2522                                            extended chars.  Need to start over,
2523                                            process everything using the general
2524                                            extended-char mechanism, and need to
2525                                            use charset_mule and charset_mule_not
2526                                            instead of charset and
2527                                            charset_not. */
2528                                         goto start_over_with_extended;
2529                                 }
2530 #endif                          /* MULE */
2531                                 /* \ might escape characters inside [...] and
2532                                    [^...].  */
2533                                 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) &&
2534                                     c == '\\') {
2535                                         if (p == pend) {
2536                                                 free_stack();
2537                                                 return REG_EESCAPE;
2538                                         }
2539
2540                                         PATFETCH(c1);
2541 #ifdef MULE
2542                                         if (c1 >= 0x80
2543                                             && !has_extended_chars) {
2544                                                 has_extended_chars = 1;
2545                                                 goto start_over_with_extended;
2546                                         }
2547 #endif                          /* MULE */
2548                                         SET_EITHER_BIT(c1);
2549                                         continue;
2550                                 }
2551
2552                                 /* Could be the end of the bracket
2553                                    expression.  If it's not (i.e., when
2554                                    the bracket expression is `[]' so
2555                                    far), the ']' character bit gets set
2556                                    way below.  */
2557                                 if (c == ']' && p != p1 + 1)
2558                                         break;
2559
2560                                 /* Look ahead to see if it's a range
2561                                    when the last thing was a character
2562                                    class.  */
2563                                 if (had_char_class && c == '-'
2564                                     && *p != ']') {
2565                                         free_stack();
2566                                         return REG_ERANGE;
2567                                 }
2568
2569                                 /* Look ahead to see if it's a range
2570                                    when the last thing was a character:
2571                                    if this is a hyphen not at the
2572                                    beginning or the end of a list, then
2573                                    it's the range operator.  */
2574                                 if (c == '-'
2575                                     && !(p - 2 >= pattern
2576                                          && p[-2] == '[')
2577                                     && !(p - 3 >= pattern
2578                                          && p[-3] == '['
2579                                          && p[-2] == '^')
2580                                     && *p != ']') {
2581                                         reg_errcode_t ret;
2582
2583 #ifdef MULE
2584                                         if (*(const unsigned char *)p >= 0x80
2585                                             && !has_extended_chars) {
2586                                                 has_extended_chars = 1;
2587                                                 goto start_over_with_extended;
2588                                         }
2589                                         if (has_extended_chars)
2590                                                 ret = compile_extended_range
2591                                                         (&p, pend, translate,
2592                                                          syntax, rtab);
2593                                         else
2594 #endif                          /* MULE */
2595                                                 ret = compile_range
2596                                                         (&p, pend, translate,
2597                                                          syntax, buf_end);
2598                                         if (ret != REG_NOERROR) {
2599                                                 free_stack();
2600                                                 return ret;
2601                                         }
2602                                 }
2603
2604                                 else if (p[0] == '-' && p[1] != ']') {
2605                                         /* This handles ranges made up of
2606                                            characters only.  */
2607                                         reg_errcode_t ret;
2608
2609                                         /* Move past the `-'.  */
2610                                         PATFETCH(c1);
2611
2612 #ifdef MULE
2613                                         if (*(const unsigned char*)p >= 0x80
2614                                             && !has_extended_chars) {
2615                                                 has_extended_chars = 1;
2616                                                 goto start_over_with_extended;
2617                                         }
2618                                         if (has_extended_chars)
2619                                                 ret = compile_extended_range(
2620                                                         &p, pend, translate,
2621                                                         syntax, rtab);
2622                                         else
2623 #endif                          /* MULE */
2624                                                 ret = compile_range(
2625                                                         &p, pend, translate,
2626                                                         syntax, buf_end);
2627                                         if (ret != REG_NOERROR) {
2628                                                 free_stack();
2629                                                 return ret;
2630                                         }
2631                                 }
2632
2633                                 /* See if we're at the beginning of a possible
2634                                    character class.  */
2635
2636                                 else if (syntax & RE_CHAR_CLASSES &&
2637                                          c == '[' && *p == ':') {       
2638                                         /* Leave room for the null.  */
2639                                         char str[CHAR_CLASS_MAX_LENGTH + 1];
2640
2641                                         PATFETCH(c);
2642                                         c1 = 0;
2643
2644                                         /* If pattern is `[[:'.  */
2645                                         if (p == pend) {
2646                                                 free_stack();
2647                                                 return REG_EBRACK;
2648                                         }
2649                                         for (;;) {
2650                                                 /* #### This code is unused.
2651                                                    Correctness is not checked
2652                                                    after TRT table change.  */
2653                                                 PATFETCH(c);
2654                                                 if (c == ':' || c == ']'
2655                                                     || p == pend
2656                                                     || c1 ==
2657                                                     CHAR_CLASS_MAX_LENGTH)
2658                                                         break;
2659                                                 str[c1++] = (char)c;
2660                                         }
2661                                         str[c1] = '\0';
2662
2663                                         /* If isn't a word bracketed by
2664                                            `[:' and `:]': undo the
2665                                            ending character, the
2666                                            letters, and leave the
2667                                            leading `:' and `[' (but set
2668                                            bits for them).  */
2669                                         if (c == ':' && *p == ']') {
2670                                                 re_wctype_t wct =
2671                                                         re_wctype(
2672                                                                 (unsigned char*)
2673                                                                 str);
2674
2675                                                 if (wct == RECC_ERROR) {
2676                                                         free_stack();
2677                                                         return REG_ECTYPE;
2678                                                 }
2679
2680                                                 /* Throw away the ] at
2681                                                    the end of the
2682                                                    character class.  */
2683                                                 PATFETCH(c);
2684
2685                                                 if (p == pend) {
2686                                                         free_stack();
2687                                                         return REG_EBRACK;
2688                                                 }
2689
2690                                                 for (int ch = 0;
2691                                                      ch < 1<<BYTEWIDTH;
2692                                                      ch++) {
2693                                                         /* rmsmacs has
2694                                                            this */
2695 #if 0
2696                                                         int translated =
2697                                                                 TRANSLATE(ch);
2698                                                         if (translated <
2699                                                             (1 << BYTEWIDTH) &&
2700                                                             re_iswctype(ch, wct)) {
2701                                                                 SET_EITHER_BIT(ch);
2702                                                         }
2703 #else
2704                                                         if (re_iswctype(ch, wct)) {
2705                                                                 SET_EITHER_BIT(ch);
2706                                                         }
2707 #endif
2708                                                 }
2709                                                 had_char_class = true;
2710                                         } else {
2711                                                 c1++;
2712                                                 while (c1--) {
2713                                                         PATUNFETCH;
2714                                                 }
2715                                                 SET_EITHER_BIT('[');
2716                                                 SET_EITHER_BIT(':');
2717                                                 had_char_class = false;
2718                                         }
2719                                 } else {
2720                                         had_char_class = false;
2721                                         SET_EITHER_BIT(c);
2722                                 }
2723                         }
2724
2725 #ifdef MULE
2726                         if (has_extended_chars) {
2727                                 /* We have a range table, not a bit vector. */
2728                                 int bytes_needed =
2729                                         unified_range_table_bytes_needed
2730                                         (rtab);
2731                                 GET_BUFFER_SPACE(bytes_needed);
2732                                 unified_range_table_copy_data(rtab,
2733                                                               buf_end);
2734                                 buf_end +=
2735                                         unified_range_table_bytes_used
2736                                         (buf_end);
2737                                 break;
2738                         }
2739 #endif                          /* MULE */
2740                                 /* Discard any (non)matching list bytes that are
2741                                    all 0 at the end of the map.  Decrease the
2742                                    map-length byte too.  */
2743                         while ((int)buf_end[-1] > 0
2744                                && buf_end[buf_end[-1] - 1] == 0)
2745                                 buf_end[-1]--;
2746                         buf_end += buf_end[-1];
2747                 }
2748                         break;
2749
2750                 case '(':
2751                         if (syntax & RE_NO_BK_PARENS)
2752                                 goto handle_open;
2753                         else
2754                                 goto normal_char;
2755
2756                 case ')':
2757                         if (syntax & RE_NO_BK_PARENS)
2758                                 goto handle_close;
2759                         else
2760                                 goto normal_char;
2761
2762                 case '\n':
2763                         if (syntax & RE_NEWLINE_ALT)
2764                                 goto handle_alt;
2765                         else
2766                                 goto normal_char;
2767
2768                 case '|':
2769                         if (syntax & RE_NO_BK_VBAR)
2770                                 goto handle_alt;
2771                         else
2772                                 goto normal_char;
2773
2774                 case '{':
2775                         if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2776                                 goto handle_interval;
2777                         else
2778                                 goto normal_char;
2779
2780                 case '\\':
2781                         if (p == pend) {
2782                                 free_stack();
2783                                 return REG_EESCAPE;
2784                         }
2785
2786                         /* Do not translate the character after the \, so that we can
2787                            distinguish, e.g., \B from \b, even if we normally would
2788                            translate, e.g., B to b.  */
2789                         PATFETCH_RAW(c);
2790