XDG Compliant user package tree -- doc updates.
[sxemacs] / src / regex.c
1 /* Extended regular expression matching and search library,
2    version 0.12, extended for XEmacs.
3    (Implements POSIX draft P10003.2/D11.2, except for
4    internationalization features.)
5
6    Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7    Copyright (C) 1995 Sun Microsystems, Inc.
8    Copyright (C) 1995 Ben Wing.
9
10 This file is part of SXEmacs
11
12 SXEmacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
16
17 SXEmacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License
23 along with this program.  If not, see <http://www.gnu.org/licenses/>. */
24
25
26 /* Synched up with: FSF 19.29. */
27
28 /* Changes made for XEmacs:
29
30    (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31        was added.  This causes a huge speedup in font-locking.
32    (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33        being used, because it's too slow -- all those calls to mmap()
34        add humongous overhead.
35    (3) Lots and lots of changes for Mule.  They are bracketed by
36        `#ifdef MULE' or with comments that have `XEmacs' in them.
37  */
38
39 #ifdef HAVE_CONFIG_H
40 #include <config.h>
41 #endif
42
43 #ifndef REGISTER                /* Rigidly enforced as of 20.3 */
44 #define REGISTER
45 #endif
46
47 #ifndef _GNU_SOURCE
48 #define _GNU_SOURCE 1
49 #endif
50
51 #ifdef emacs
52 /* Converts the pointer to the char to BEG-based offset from the start.  */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING                      \
54                           ? (d) - string1 : (d) - (string2 - size1))
55 #else
56 #define PTR_TO_OFFSET(d) 0
57 #endif
58
59 /* We assume non-Mule if emacs isn't defined. */
60 #ifndef emacs
61 #undef MULE
62 #endif
63
64 /* We need this for `regex.h', and perhaps for the Emacs include files.  */
65 #include <sys/types.h>
66
67 /* This is for other GNU distributions with internationalized messages.  */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
69 # include <libintl.h>
70 #else
71 # define gettext(msgid) (msgid)
72 #endif
73
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75    the beginning of a line.  Keep the ifdefs so that it's easier to
76    tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
78
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80    poorly, so we disable it here. */
81
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
83 # undef REL_ALLOC
84 #endif
85
86 /* The `emacs' switch turns on certain matching commands
87    that make sense only in Emacs. */
88 #ifdef emacs
89
90 #include "lisp.h"
91 #include "buffer.h"
92 #include "syntax.h"
93
94 #ifdef MULE
95
96 Lisp_Object Vthe_lisp_rangetab;
97
98 void complex_vars_of_regex(void)
99 {
100         Vthe_lisp_rangetab = Fmake_range_table();
101         staticpro(&Vthe_lisp_rangetab);
102 }
103
104 #else  /* not MULE */
105
106 void complex_vars_of_regex(void)
107 {
108 }
109
110 #endif  /* MULE */
111
112 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
113 #define TRANSLATE_P(tr) (!NILP (tr) && CHAR_TABLEP(tr))
114
115 /* just massage that xfree thing */
116 #undef xfree
117 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
118 # define xfree          zfree
119 #else  /* !BDWGC */
120 # define xfree          yfree
121 #endif  /* BDWGC */
122
123 #else  /* !emacs */
124
125 /* If we are not linking with Emacs proper,
126    we can't use the relocating allocator
127    even if config.h says that we can.  */
128 #undef REL_ALLOC
129
130 #if defined (STDC_HEADERS) || defined (_LIBC)
131 # include <stdlib.h>
132 # include <stdbool.h>
133 #else
134 char *malloc();
135 char *realloc();
136 #endif
137
138 /* Types normally included via lisp.h */
139 #include <stddef.h>             /* for ptrdiff_t */
140
141 #ifdef REGEX_MALLOC
142 #ifndef DECLARE_NOTHING
143 #define DECLARE_NOTHING struct nosuchstruct
144 #endif
145 #endif
146
147 typedef int Emchar;
148
149 #define charptr_emchar(str)             ((Emchar) (str)[0])
150
151 #define INC_CHARPTR(p) ((p)++)
152 #define DEC_CHARPTR(p) ((p)--)
153
154 #include <string.h>
155
156 /* Define the syntax stuff for \<, \>, etc.  */
157
158 /* This must be nonzero for the wordchar and notwordchar pattern
159    commands in re_match_2.  */
160 #ifndef Sword
161 #define Sword 1
162 #endif
163
164 #ifdef SYNTAX_TABLE
165
166 extern char *re_syntax_table;
167
168 #else                           /* not SYNTAX_TABLE */
169
170 /* How many characters in the character set.  */
171 #define CHAR_SET_SIZE 256
172
173 static char re_syntax_table[CHAR_SET_SIZE];
174
175 static void init_syntax_once(void)
176 {
177         static int done = 0;
178
179         if (!done) {
180                 const char *word_syntax_chars =
181                     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
182
183                 memset(re_syntax_table, 0, sizeof(re_syntax_table));
184
185                 while (*word_syntax_chars)
186                         re_syntax_table[(unsigned int)(*word_syntax_chars++)] =
187                             Sword;
188
189                 done = 1;
190         }
191 }
192
193 #endif                          /* SYNTAX_TABLE */
194
195 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
196 #undef SYNTAX_FROM_CACHE
197 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
198
199 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
200 #define TRANSLATE_P(tr) tr
201
202 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
203 # if defined HAVE_GC_GC_H
204 #  include "gc/gc.h"
205 # elif defined HAVE_GC_H
206 #  include "gc.h"
207 # endif
208 # define xmalloc        GC_MALLOC
209 # define xmalloc_atomic GC_MALLOC_ATOMIC
210 # define xrealloc       GC_REALLOC
211 # define xfree          GC_FREE
212 # define xstrdup        GC_STRDUP
213 #else  /* !HAVE_BDWGC */
214 # define xmalloc        malloc
215 # define xmalloc_atomic malloc
216 # define xrealloc       realloc
217 # define xfree          free
218 # define xstrdup        strdup
219 #endif  /* HAVE_BDWGC */
220
221 #endif  /* emacs */
222
223
224 /* Under XEmacs, this is needed because we don't define it elsewhere. */
225 #ifdef SWITCH_ENUM_BUG
226 #define SWITCH_ENUM_CAST(x) ((int)(x))
227 #else
228 #define SWITCH_ENUM_CAST(x) (x)
229 #endif
230 \f
231 /* Get the interface, including the syntax bits.  */
232 #include "regex.h"
233
234 /* isalpha etc. are used for the character classes.  */
235 #include <ctype.h>
236
237 /* Jim Meyering writes:
238
239    "... Some ctype macros are valid only for character codes that
240    isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
241    using /bin/cc or gcc but without giving an ansi option).  So, all
242    ctype uses should be through macros like ISPRINT...  If
243    STDC_HEADERS is defined, then autoconf has verified that the ctype
244    macros don't need to be guarded with references to isascii. ...
245    Defining isascii to 1 should let any compiler worth its salt
246    eliminate the && through constant folding."  */
247
248 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
249 #define ISASCII_1(c) 1
250 #else
251 #define ISASCII_1(c) isascii(c)
252 #endif
253
254 #ifdef MULE
255 /* The IS*() macros can be passed any character, including an extended
256    one.  We need to make sure there are no crashes, which would occur
257    otherwise due to out-of-bounds array references. */
258 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
259 #else
260 #define ISASCII(c) ISASCII_1 (c)
261 #endif                          /* MULE */
262
263 #ifdef isblank
264 #define ISBLANK(c) (ISASCII (c) && isblank (c))
265 #else
266 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
267 #endif
268 #ifdef isgraph
269 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
270 #else
271 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
272 #endif
273
274 /* stolen from FSF/src/charset.h */
275 #define SINGLE_BYTE_CHAR_P(c)   (((unsigned int)(c) & 0xFF) == (c))
276 /* 1 if C is an ASCII character.  */
277 #define IS_REAL_ASCII(c) ((c) < 128)
278 /* 1 if C is a unibyte character.  */
279 #define ISUNIBYTE(c)    (SINGLE_BYTE_CHAR_P(c))
280
281 #define ISPRINT(c) (ISASCII (c) && isprint (c))
282 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
283 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
284 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
285 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
286 #define ISLOWER(c) (ISASCII (c) && islower (c))
287 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
288 #define ISSPACE(c) (ISASCII (c) && isspace (c))
289 #define ISUPPER(c) (ISASCII (c) && isupper (c))
290 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
291
292 #if defined emacs
293 static inline bool
294 ISWORD(int c)
295         __attribute__((always_inline));
296 static inline bool
297 ISWORD(int c)
298 {
299         Lisp_Object ct = regex_emacs_buffer->mirror_syntax_table;
300
301         return (SYNTAX(XCHAR_TABLE(ct), c) == Sword);
302 }
303 #else
304 # define ISWORD(c)      (ISALPHA(c))
305 #endif  /* emacs */
306
307 #ifndef NULL
308 #define NULL (void *)0
309 #endif
310
311 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
312    since ours (we hope) works properly with all combinations of
313    machines, compilers, `char' and `unsigned char' argument types.
314    (Per Bothner suggested the basic approach.)  */
315 #undef SIGN_EXTEND_CHAR
316 #if __STDC__
317 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
318 #else                           /* not __STDC__ */
319 /* As in Harbison and Steele.  */
320 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
321 #endif
322 \f
323 /* Should we use malloc or alloca?  If REGEX_MALLOC is not defined, we
324    use `alloca' instead of `malloc'.  This is because using malloc in
325    re_search* or re_match* could cause memory leaks when C-g is used in
326    Emacs; also, malloc is slower and causes storage fragmentation.  On
327    the other hand, malloc is more portable, and easier to debug.
328
329    Because we sometimes use alloca, some routines have to be macros,
330    not functions -- `alloca'-allocated space disappears at the end of the
331    function it is called in.  */
332
333 #ifdef REGEX_MALLOC
334
335 #define REGEX_ALLOCATE  xmalloc
336 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc(source, nsize)
337 #define REGEX_FREE      xfree
338
339 #else  /* !REGEX_MALLOC */
340
341 /* Emacs already defines alloca, sometimes.  */
342 #ifndef alloca
343
344 /* Make alloca work the best possible way.  */
345 #ifdef __GNUC__
346 #define alloca __builtin_alloca
347 #else                           /* not __GNUC__ */
348 #if HAVE_ALLOCA_H
349 #include <alloca.h>
350 #else                           /* not __GNUC__ or HAVE_ALLOCA_H */
351 #ifndef _AIX                    /* Already did AIX, up at the top.  */
352 void *alloca();
353 #endif                          /* not _AIX */
354 #endif                          /* HAVE_ALLOCA_H */
355 #endif                          /* __GNUC__ */
356
357 #endif                          /* not alloca */
358
359 #define REGEX_ALLOCATE alloca
360
361 /* Assumes a `char *destination' variable.  */
362 #define REGEX_REALLOCATE(source, osize, nsize)                          \
363   (destination = (char *) alloca (nsize),                               \
364    memmove (destination, source, osize),                                \
365    destination)
366
367 /* No need to do anything to free, after alloca.  */
368 #define REGEX_FREE(arg) ((void)0)       /* Do nothing!  But inhibit gcc warning.  */
369
370 #endif  /* REGEX_MALLOC */
371
372 /* Define how to allocate the failure stack.  */
373
374 #ifdef REL_ALLOC
375 #error not again
376 #define REGEX_ALLOCATE_STACK(size)                              \
377         r_alloc ((char **) &failure_stack_ptr, (size))
378 #define REGEX_REALLOCATE_STACK(source, osize, nsize)            \
379         r_re_alloc ((char **) &failure_stack_ptr, (nsize))
380 #define REGEX_FREE_STACK(ptr)                                   \
381         r_alloc_free ((void **) &failure_stack_ptr)
382
383 #else  /* !REL_ALLOC */
384
385 #ifdef REGEX_MALLOC
386
387 #define REGEX_ALLOCATE_STACK    xmalloc
388 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc(source, nsize)
389 #define REGEX_FREE_STACK        xfree
390
391 #else  /* !REGEX_MALLOC */
392
393 #define REGEX_ALLOCATE_STACK alloca
394
395 #define REGEX_REALLOCATE_STACK(source, osize, nsize)                    \
396    REGEX_REALLOCATE (source, osize, nsize)
397 /* No need to explicitly free anything.  */
398 #define REGEX_FREE_STACK(arg)
399
400 #endif  /* REGEX_MALLOC */
401 #endif  /* REL_ALLOC */
402
403 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
404    `string1' or just past its end.  This works if PTR is NULL, which is
405    a good thing.  */
406 #define FIRST_STRING_P(ptr)                                     \
407   (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
408
409 /* (Re)Allocate N items of type T using malloc, or fail.  */
410 #define TALLOC(n, t)                            \
411         ((t *)xmalloc((n) * sizeof (t)))
412 #define TALLOC_ATOMIC(n, t)                     \
413         ((t *)xmalloc_atomic((n) * sizeof (t)))
414 #define RETALLOC(addr, n, t)                                    \
415         ((addr) = (t *)xrealloc(addr, (n) * sizeof (t)))
416 #define RETALLOC_IF(addr, n, t)                                 \
417         if (addr) {                                             \
418                 RETALLOC((addr), (n), t);                       \
419         } else                                                  \
420                 (addr) = TALLOC ((n), t)
421 #define REGEX_TALLOC(n, t)                              \
422         ((t *)REGEX_ALLOCATE((n) * sizeof (t)))
423
424 #define BYTEWIDTH 8             /* In bits.  */
425
426 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
427
428 #undef MAX
429 #undef MIN
430 #define MAX(a, b) ((a) > (b) ? (a) : (b))
431 #define MIN(a, b) ((a) < (b) ? (a) : (b))
432
433 /* Type of source-pattern and string chars.  */
434 typedef const unsigned char re_char;
435
436 typedef char re_bool;
437 #define false 0
438 #define true 1
439 \f
440 /* These are the command codes that appear in compiled regular
441    expressions.  Some opcodes are followed by argument bytes.  A
442    command code can specify any interpretation whatsoever for its
443    arguments.  Zero bytes may appear in the compiled regular expression.  */
444
445 typedef enum {
446         no_op = 0,
447
448         /* Succeed right away--no more backtracking.  */
449         succeed,
450
451         /* Followed by one byte giving n, then by n literal bytes.  */
452         exactn,
453
454         /* Matches any (more or less) character.  */
455         anychar,
456
457         /* Matches any one char belonging to specified set.  First
458            following byte is number of bitmap bytes.  Then come bytes
459            for a bitmap saying which chars are in.  Bits in each byte
460            are ordered low-bit-first.  A character is in the set if its
461            bit is 1.  A character too large to have a bit in the map is
462            automatically not in the set.  */
463         charset,
464
465         /* Same parameters as charset, but match any character that is
466            not one of those specified.  */
467         charset_not,
468
469         /* Start remembering the text that is matched, for storing in a
470            register.  Followed by one byte with the register number, in
471            the range 1 to the pattern buffer's re_ngroups
472            field.  Then followed by one byte with the number of groups
473            inner to this one.  (This last has to be part of the
474            start_memory only because we need it in the on_failure_jump
475            of re_match_2.)  */
476         start_memory,
477
478         /* Stop remembering the text that is matched and store it in a
479            memory register.  Followed by one byte with the register
480            number, in the range 1 to `re_ngroups' in the
481            pattern buffer, and one byte with the number of inner groups,
482            just like `start_memory'.  (We need the number of inner
483            groups here because we don't have any easy way of finding the
484            corresponding start_memory when we're at a stop_memory.)  */
485         stop_memory,
486
487         /* Match a duplicate of something remembered. Followed by one
488            byte containing the register number.  */
489         duplicate,
490
491         /* Fail unless at beginning of line.  */
492         begline,
493
494         /* Fail unless at end of line.  */
495         endline,
496
497         /* Succeeds if at beginning of buffer (if emacs) or at beginning
498            of string to be matched (if not).  */
499         begbuf,
500
501         /* Analogously, for end of buffer/string.  */
502         endbuf,
503
504         /* Followed by two byte relative address to which to jump.  */
505         jump,
506
507         /* Same as jump, but marks the end of an alternative.  */
508         jump_past_alt,
509
510         /* Followed by two-byte relative address of place to resume at
511            in case of failure.  */
512         on_failure_jump,
513
514         /* Like on_failure_jump, but pushes a placeholder instead of the
515            current string position when executed.  */
516         on_failure_keep_string_jump,
517
518         /* Throw away latest failure point and then jump to following
519            two-byte relative address.  */
520         pop_failure_jump,
521
522         /* Change to pop_failure_jump if know won't have to backtrack to
523            match; otherwise change to jump.  This is used to jump
524            back to the beginning of a repeat.  If what follows this jump
525            clearly won't match what the repeat does, such that we can be
526            sure that there is no use backtracking out of repetitions
527            already matched, then we change it to a pop_failure_jump.
528            Followed by two-byte address.  */
529         maybe_pop_jump,
530
531         /* Jump to following two-byte address, and push a dummy failure
532            point. This failure point will be thrown away if an attempt
533            is made to use it for a failure.  A `+' construct makes this
534            before the first repeat.  Also used as an intermediary kind
535            of jump when compiling an alternative.  */
536         dummy_failure_jump,
537
538         /* Push a dummy failure point and continue.  Used at the end of
539            alternatives.  */
540         push_dummy_failure,
541
542         /* Followed by two-byte relative address and two-byte number n.
543            After matching N times, jump to the address upon failure.  */
544         succeed_n,
545
546         /* Followed by two-byte relative address, and two-byte number n.
547            Jump to the address N times, then fail.  */
548         jump_n,
549
550         /* Set the following two-byte relative address to the
551            subsequent two-byte number.  The address *includes* the two
552            bytes of number.  */
553         set_number_at,
554
555         wordchar,               /* Matches any word-constituent character.  */
556         notwordchar,            /* Matches any char that is not a word-constituent.  */
557
558         wordbeg,                /* Succeeds if at word beginning.  */
559         wordend,                /* Succeeds if at word end.  */
560
561         wordbound,              /* Succeeds if at a word boundary.  */
562         notwordbound            /* Succeeds if not at a word boundary.  */
563 #ifdef emacs
564             , before_dot,       /* Succeeds if before point.  */
565         at_dot,                 /* Succeeds if at point.  */
566         after_dot,              /* Succeeds if after point.  */
567
568         /* Matches any character whose syntax is specified.  Followed by
569            a byte which contains a syntax code, e.g., Sword.  */
570         syntaxspec,
571
572         /* Matches any character whose syntax is not that specified.  */
573         notsyntaxspec
574 #endif                          /* emacs */
575 #ifdef MULE
576             /* need extra stuff to be able to properly work with XEmacs/Mule
577                characters (which may take up more than one byte) */
578             , charset_mule,     /* Matches any character belonging to specified set.
579                                    The set is stored in "unified range-table
580                                    format"; see rangetab.c.  Unlike the `charset'
581                                    opcode, this can handle arbitrary characters. */
582
583         charset_mule_not        /* Same parameters as charset_mule, but match any
584                                    character that is not one of those specified.  */
585             /* 97/2/17 jhod: The following two were merged back in from the Mule
586                2.3 code to enable some language specific processing */
587             , categoryspec,     /* Matches entries in the character category tables */
588         notcategoryspec         /* The opposite of the above */
589 #endif                          /* MULE */
590 } re_opcode_t;
591 \f
592 /* Common operations on the compiled pattern.  */
593
594 /* Store NUMBER in two contiguous bytes starting at DESTINATION.  */
595
596 #define STORE_NUMBER(destination, number)                               \
597   do {                                                                  \
598     (destination)[0] = (number) & 0377;                                 \
599     (destination)[1] = (number) >> 8;                                   \
600   } while (0)
601
602 /* Same as STORE_NUMBER, except increment DESTINATION to
603    the byte after where the number is stored.  Therefore, DESTINATION
604    must be an lvalue.  */
605
606 #define STORE_NUMBER_AND_INCR(destination, number)                      \
607   do {                                                                  \
608     STORE_NUMBER (destination, number);                                 \
609     (destination) += 2;                                                 \
610   } while (0)
611
612 /* Put into DESTINATION a number stored in two contiguous bytes starting
613    at SOURCE.  */
614
615 #define EXTRACT_NUMBER(destination, source)                             \
616   do {                                                                  \
617     (destination) = *(source) & 0377;                                   \
618     (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8;           \
619   } while (0)
620
621 #ifdef REGEX_DEBUG_FLAG
622 static void extract_number(int *dest, re_char * source)
623 {
624         int temp = SIGN_EXTEND_CHAR(*(source + 1));
625         *dest = *source & 0377;
626         *dest += temp << 8;
627 }
628
629 #ifndef EXTRACT_MACROS          /* To debug the macros.  */
630 #undef EXTRACT_NUMBER
631 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
632 #endif                          /* not EXTRACT_MACROS */
633
634 #endif                          /* REGEX_DEBUG_FLAG */
635
636 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
637    SOURCE must be an lvalue.  */
638
639 #define EXTRACT_NUMBER_AND_INCR(destination, source)                    \
640         do {                                                            \
641                 EXTRACT_NUMBER (destination, source);                   \
642                 (source) += 2;                                          \
643         } while (0)
644
645 #ifdef REGEX_DEBUG_FLAG
646 static void
647 extract_number_and_incr(int *destination, unsigned char **source)
648 {
649         extract_number(destination, *source);
650         *source += 2;
651 }
652
653 #ifndef EXTRACT_MACROS
654 #undef EXTRACT_NUMBER_AND_INCR
655 #define EXTRACT_NUMBER_AND_INCR(dest, src)      \
656         extract_number_and_incr (&dest, &src)
657 #endif                          /* not EXTRACT_MACROS */
658
659 #endif                          /* REGEX_DEBUG_FLAG */
660 \f
661 /* If REGEX_DEBUG_FLAG is defined, Regex prints many voluminous messages about
662    what it is doing (if the variable `debug' is nonzero).  If linked with the
663    main program in `iregex.c', you can enter patterns and strings interactively.
664    And if linked with the main program in `main.c' and the other test files, you
665    can run the already-written tests.  */
666
667 #if defined (REGEX_DEBUG_FLAG)
668
669 /* We use standard I/O for debugging.  */
670 #include <stdio.h>
671
672 #ifndef emacs
673 /* XEmacs provides its own version of assert() */
674 /* It is useful to test things that ``must'' be true when debugging.  */
675 #include <assert.h>
676 #endif
677
678 #if 0
679 /* no point in having this one, since we do not offer any accessor to this */
680 static int debug = 0;
681 #endif
682
683 #define DEBUG_STATEMENT(e)              e
684 #define DEBUG_PRINT1(x)                 printf(x)
685 #define DEBUG_PRINT2(x1, x2)            printf(x1, x2)
686 #define DEBUG_PRINT3(x1, x2, x3)        printf(x1, x2, x3)
687 #define DEBUG_PRINT4(x1, x2, x3, x4)    printf(x1, x2, x3, x4)
688 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)                           \
689         print_partial_compiled_pattern(s, e)
690 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)                  \
691         print_double_string (w, s1, sz1, s2, sz2)
692
693 /* Print the fastmap in human-readable form.  */
694
695 static void print_fastmap(char *fastmap)
696 {
697         unsigned was_a_range = 0;
698         unsigned i = 0;
699
700         while (i < (1 << BYTEWIDTH)) {
701                 if (fastmap[i++]) {
702                         was_a_range = 0;
703                         putchar(i - 1);
704                         while (i < (1 << BYTEWIDTH) && fastmap[i]) {
705                                 was_a_range = 1;
706                                 i++;
707                         }
708                         if (was_a_range) {
709                                 putchar('-');
710                                 putchar(i - 1);
711                         }
712                 }
713         }
714         putchar('\n');
715 }
716
717 /* Print a compiled pattern string in human-readable form, starting at
718    the START pointer into it and ending just before the pointer END.  */
719
720 /* can't help it ... the ugly mule code changes `start' upon alignment,
721  * so this can't have the const qualifier ... bugger */
722 static void
723 print_partial_compiled_pattern(unsigned char *start, re_char *end)
724 {
725         int mcnt, mcnt2;
726         unsigned char *p = start;
727         re_char *pend = end;
728
729         if (start == NULL) {
730                 puts("(null)");
731                 return;
732         }
733
734         /* Loop over pattern commands.  */
735         while (p < pend) {
736                 printf("%ld:\t", (long)(p - start));
737
738                 switch ((const re_opcode_t)*p++) {
739                 case no_op:
740                         printf("/no_op");
741                         break;
742
743                 case exactn:
744                         mcnt = *p++;
745                         printf("/exactn/%d", mcnt);
746                         do {
747                                 putchar('/');
748                                 putchar(*p++);
749                         }
750                         while (--mcnt);
751                         break;
752
753                 case start_memory:
754                         mcnt = *p++;
755                         printf("/start_memory/%d/%d", mcnt, *p++);
756                         break;
757
758                 case stop_memory:
759                         mcnt = *p++;
760                         printf("/stop_memory/%d/%d", mcnt, *p++);
761                         break;
762
763                 case duplicate:
764                         printf("/duplicate/%d", *p++);
765                         break;
766
767                 case anychar:
768                         printf("/anychar");
769                         break;
770
771                 case charset:
772                 case charset_not: {
773                         REGISTER int c, last = -100;
774                         REGISTER int in_range = 0;
775
776                         printf("/charset [%s",
777                                (re_opcode_t) * (p - 1) == charset_not
778                                ? "^" : "");
779
780                         assert(p + *p < pend);
781
782                         for (c = 0; c < 256; c++)
783                                 if (((unsigned char)(c / 8) < *p)
784                                     && (p[1 + (c / 8)] &
785                                         (1 << (c % 8)))) {
786                                         /* Are we starting a range?  */
787                                         if (last + 1 == c && !in_range) {
788                                                 putchar('-');
789                                                 in_range = 1;
790                                         }
791                                         /* Have we broken a range?  */
792                                         else if (last + 1 != c
793                                                  && in_range) {
794                                                 putchar(last);
795                                                 in_range = 0;
796                                         }
797
798                                         if (!in_range)
799                                                 putchar(c);
800
801                                         last = c;
802                                 }
803
804                         if (in_range)
805                                 putchar(last);
806
807                         putchar(']');
808
809                         p += 1 + *p;
810                         break;
811                 }
812
813 #ifdef MULE
814                 case charset_mule:
815                 case charset_mule_not: {
816                         int nentries, i;
817
818                         printf("/charset_mule [%s",
819                                (re_opcode_t) * (p - 1) == charset_mule_not
820                                ? "^" : "");
821                         /* u_r_t_nentries() CHANGES its arg, why on earth
822                          * is this marked const here then? :O -hrop */
823                         nentries = unified_range_table_nentries(p);
824                         for (i = 0; i < nentries; i++) {
825                                 EMACS_INT first, last;
826                                 Lisp_Object dummy_val;
827
828                                 /* u_r_t_get_range CHANGES its arg ...
829                                  * p can't be const thence */
830                                 unified_range_table_get_range(p, i,
831                                                               &first,
832                                                               &last,
833                                                               &dummy_val);
834                                 if (first < 0x100)
835                                         putchar(first);
836                                 else
837                                         printf("(0x%lx)", (long)first);
838                                 if (first != last) {
839                                         putchar('-');
840                                         if (last < 0x100)
841                                                 putchar(last);
842                                         else
843                                                 printf("(0x%lx)",
844                                                        (long)last);
845                                 }
846                         }
847                         putchar(']');
848                         p += unified_range_table_bytes_used(p);
849                 }
850                         break;
851 #endif
852
853                 case begline:
854                         printf("/begline");
855                         break;
856
857                 case endline:
858                         printf("/endline");
859                         break;
860
861                 case on_failure_jump:
862                         extract_number_and_incr(&mcnt, &p);
863                         printf("/on_failure_jump to %ld",
864                                (long)(p + mcnt - start));
865                         break;
866
867                 case on_failure_keep_string_jump:
868                         extract_number_and_incr(&mcnt, &p);
869                         printf("/on_failure_keep_string_jump to %ld",
870                                (long)(p + mcnt - start));
871                         break;
872
873                 case dummy_failure_jump:
874                         extract_number_and_incr(&mcnt, &p);
875                         printf("/dummy_failure_jump to %ld",
876                                (long)(p + mcnt - start));
877                         break;
878
879                 case push_dummy_failure:
880                         printf("/push_dummy_failure");
881                         break;
882
883                 case maybe_pop_jump:
884                         extract_number_and_incr(&mcnt, &p);
885                         printf("/maybe_pop_jump to %ld",
886                                (long)(p + mcnt - start));
887                         break;
888
889                 case pop_failure_jump:
890                         extract_number_and_incr(&mcnt, &p);
891                         printf("/pop_failure_jump to %ld",
892                                (long)(p + mcnt - start));
893                         break;
894
895                 case jump_past_alt:
896                         extract_number_and_incr(&mcnt, &p);
897                         printf("/jump_past_alt to %ld",
898                                (long)(p + mcnt - start));
899                         break;
900
901                 case jump:
902                         extract_number_and_incr(&mcnt, &p);
903                         printf("/jump to %ld", (long)(p + mcnt - start));
904                         break;
905
906                 case succeed_n:
907                         extract_number_and_incr(&mcnt, &p);
908                         extract_number_and_incr(&mcnt2, &p);
909                         printf("/succeed_n to %ld, %d times",
910                                (long)(p + mcnt - start), mcnt2);
911                         break;
912
913                 case jump_n:
914                         extract_number_and_incr(&mcnt, &p);
915                         extract_number_and_incr(&mcnt2, &p);
916                         printf("/jump_n to %ld, %d times",
917                                (long)(p + mcnt - start), mcnt2);
918                         break;
919
920                 case set_number_at:
921                         extract_number_and_incr(&mcnt, &p);
922                         extract_number_and_incr(&mcnt2, &p);
923                         printf("/set_number_at location %ld to %d",
924                                (long)(p + mcnt - start), mcnt2);
925                         break;
926
927                 case wordbound:
928                         printf("/wordbound");
929                         break;
930
931                 case notwordbound:
932                         printf("/notwordbound");
933                         break;
934
935                 case wordbeg:
936                         printf("/wordbeg");
937                         break;
938
939                 case wordend:
940                         printf("/wordend");
941
942 #ifdef emacs
943                 case before_dot:
944                         printf("/before_dot");
945                         break;
946
947                 case at_dot:
948                         printf("/at_dot");
949                         break;
950
951                 case after_dot:
952                         printf("/after_dot");
953                         break;
954
955                 case syntaxspec:
956                         printf("/syntaxspec");
957                         mcnt = *p++;
958                         printf("/%d", mcnt);
959                         break;
960
961                 case notsyntaxspec:
962                         printf("/notsyntaxspec");
963                         mcnt = *p++;
964                         printf("/%d", mcnt);
965                         break;
966
967 #ifdef MULE
968 /* 97/2/17 jhod Mule category patch */
969                 case categoryspec:
970                         printf("/categoryspec");
971                         mcnt = *p++;
972                         printf("/%d", mcnt);
973                         break;
974
975                 case notcategoryspec:
976                         printf("/notcategoryspec");
977                         mcnt = *p++;
978                         printf("/%d", mcnt);
979                         break;
980 /* end of category patch */
981 #endif                          /* MULE */
982 #endif                          /* emacs */
983
984                 case wordchar:
985                         printf("/wordchar");
986                         break;
987
988                 case notwordchar:
989                         printf("/notwordchar");
990                         break;
991
992                 case begbuf:
993                         printf("/begbuf");
994                         break;
995
996                 case endbuf:
997                         printf("/endbuf");
998                         break;
999
1000                 case succeed:
1001                 default:
1002                         printf("?%d", *(p - 1));
1003                 }
1004
1005                 putchar('\n');
1006         }
1007
1008         printf("%ld:\tend of pattern.\n", (long)(p - start));
1009 }
1010
1011 static void
1012 print_compiled_pattern(struct re_pattern_buffer *bufp)
1013 {
1014 #if 0
1015         /* cant be const */
1016         re_char *buffer = bufp->buffer;
1017 #else
1018         unsigned char *buffer = bufp->buffer;
1019 #endif
1020
1021         print_partial_compiled_pattern(buffer, buffer + bufp->used);
1022         printf("%ld bytes used/%ld bytes allocated.\n", bufp->used,
1023                bufp->allocated);
1024
1025         if (bufp->fastmap_accurate && bufp->fastmap) {
1026                 printf("fastmap: ");
1027                 print_fastmap(bufp->fastmap);
1028         }
1029
1030         printf("re_nsub: %ld\t", (long)bufp->re_nsub);
1031         printf("re_ngroups: %ld\t", (long)bufp->re_ngroups);
1032         printf("regs_alloc: %d\t", bufp->regs_allocated);
1033         printf("can_be_null: %d\t", bufp->can_be_null);
1034         printf("newline_anchor: %d\n", bufp->newline_anchor);
1035         printf("no_sub: %d\t", bufp->no_sub);
1036         printf("not_bol: %d\t", bufp->not_bol);
1037         printf("not_eol: %d\t", bufp->not_eol);
1038         printf("syntax: %d\n", bufp->syntax);
1039         /* Perhaps we should print the translate table?  */
1040         /* and maybe the category table? */
1041
1042         if (bufp->external_to_internal_register) {
1043                 int i;
1044
1045                 printf("external_to_internal_register:\n");
1046                 for (i = 0; i <= bufp->re_nsub; i++) {
1047                         if (i > 0)
1048                                 printf(", ");
1049                         printf("%d -> %d", i,
1050                                bufp->external_to_internal_register[i]);
1051                 }
1052                 printf("\n");
1053         }
1054 }
1055
1056 static void
1057 print_double_string(re_char * where, re_char * string1, int size1,
1058                     re_char * string2, int size2)
1059 {
1060         if (where == NULL)
1061                 printf("(null)");
1062         else {
1063                 Element_count this_char;
1064
1065                 if (FIRST_STRING_P(where)) {
1066                         for (this_char = where - string1; this_char < size1;
1067                              this_char++)
1068                                 putchar(string1[this_char]);
1069
1070                         where = string2;
1071                 }
1072
1073                 for (this_char = where - string2; this_char < size2;
1074                      this_char++)
1075                         putchar(string2[this_char]);
1076         }
1077 }
1078
1079 #else                           /* not REGEX_DEBUG_FLAG */
1080
1081 #undef assert
1082 #define assert(e)
1083
1084 #define DEBUG_STATEMENT(e)
1085 #define DEBUG_PRINT1(x)
1086 #define DEBUG_PRINT2(x1, x2)
1087 #define DEBUG_PRINT3(x1, x2, x3)
1088 #define DEBUG_PRINT4(x1, x2, x3, x4)
1089 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1090 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1091
1092 #endif                          /* REGEX_DEBUG_FLAG */
1093 \f
1094 /* Set by `re_set_syntax' to the current regexp syntax to recognize.  Can
1095    also be assigned to arbitrarily: each pattern buffer stores its own
1096    syntax, so it can be changed between regex compilations.  */
1097 /* This has no initializer because initialized variables in Emacs
1098    become read-only after dumping.  */
1099 reg_syntax_t re_syntax_options;
1100
1101 /* Specify the precise syntax of regexps for compilation.  This provides
1102    for compatibility for various utilities which historically have
1103    different, incompatible syntaxes.
1104
1105    The argument SYNTAX is a bit mask comprised of the various bits
1106    defined in regex.h.  We return the old syntax.  */
1107
1108 reg_syntax_t re_set_syntax(reg_syntax_t syntax)
1109 {
1110         reg_syntax_t ret = re_syntax_options;
1111
1112         re_syntax_options = syntax;
1113         return ret;
1114 }
1115 \f
1116 /* This table gives an error message for each of the error codes listed
1117    in regex.h.  Obviously the order here has to be same as there.
1118    POSIX doesn't require that we do anything for REG_NOERROR,
1119    but why not be nice?  */
1120
1121 static const char *re_error_msgid[] = {
1122         "Success",              /* REG_NOERROR */
1123         "No match",             /* REG_NOMATCH */
1124         "Invalid regular expression",   /* REG_BADPAT */
1125         "Invalid collation character",  /* REG_ECOLLATE */
1126         "Invalid character class name", /* REG_ECTYPE */
1127         "Trailing backslash",   /* REG_EESCAPE */
1128         "Invalid back reference",       /* REG_ESUBREG */
1129         "Unmatched [ or [^",    /* REG_EBRACK */
1130         "Unmatched ( or \\(",   /* REG_EPAREN */
1131         "Unmatched \\{",        /* REG_EBRACE */
1132         "Invalid content of \\{\\}",    /* REG_BADBR */
1133         "Invalid range end",    /* REG_ERANGE */
1134         "Memory exhausted",     /* REG_ESPACE */
1135         "Invalid preceding regular expression", /* REG_BADRPT */
1136         "Premature end of regular expression",  /* REG_EEND */
1137         "Regular expression too big",   /* REG_ESIZE */
1138         "Unmatched ) or \\)",   /* REG_ERPAREN */
1139 #ifdef emacs
1140         "Invalid syntax designator",    /* REG_ESYNTAX */
1141 #endif
1142 #ifdef MULE
1143         "Ranges may not span charsets", /* REG_ERANGESPAN */
1144         "Invalid category designator",  /* REG_ECATEGORY */
1145 #endif
1146 };
1147 \f
1148 /* Avoiding alloca during matching, to placate r_alloc.  */
1149
1150 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1151    searching and matching functions should not call alloca.  On some
1152    systems, alloca is implemented in terms of malloc, and if we're
1153    using the relocating allocator routines, then malloc could cause a
1154    relocation, which might (if the strings being searched are in the
1155    ralloc heap) shift the data out from underneath the regexp
1156    routines.
1157
1158    Here's another reason to avoid allocation: Emacs
1159    processes input from X in a signal handler; processing X input may
1160    call malloc; if input arrives while a matching routine is calling
1161    malloc, then we're scrod.  But Emacs can't just block input while
1162    calling matching routines; then we don't notice interrupts when
1163    they come in.  So, Emacs blocks input around all regexp calls
1164    except the matching calls, which it leaves unprotected, in the
1165    faith that they will not malloc.  */
1166
1167 /* Normally, this is fine.  */
1168 #define MATCH_MAY_ALLOCATE
1169
1170 /* When using GNU C, we are not REALLY using the C alloca, no matter
1171    what config.h may say.  So don't take precautions for it.  */
1172 #ifdef __GNUC__
1173 #undef C_ALLOCA
1174 #endif
1175
1176 /* The match routines may not allocate if (1) they would do it with malloc
1177    and (2) it's not safe for them to use malloc.
1178    Note that if REL_ALLOC is defined, matching would not use malloc for the
1179    failure stack, but we would still use it for the register vectors;
1180    so REL_ALLOC should not affect this.  */
1181 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1182 #undef MATCH_MAY_ALLOCATE
1183 #endif
1184 \f
1185 /* Failure stack declarations and macros; both re_compile_fastmap and
1186    re_match_2 use a failure stack.  These have to be macros because of
1187    REGEX_ALLOCATE_STACK.  */
1188
1189 /* Number of failure points for which to initially allocate space
1190    when matching.  If this number is exceeded, we allocate more
1191    space, so it is not a hard limit.  */
1192 #ifndef INIT_FAILURE_ALLOC
1193 #define INIT_FAILURE_ALLOC 20
1194 #endif
1195
1196 /* Roughly the maximum number of failure points on the stack.  Would be
1197    exactly that if always used MAX_FAILURE_SPACE each time we failed.
1198    This is a variable only so users of regex can assign to it; we never
1199    change it ourselves.  */
1200 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1201 /* 4400 was enough to cause a crash on Alpha OSF/1,
1202    whose default stack limit is 2mb.  */
1203 int re_max_failures = 50000;
1204 #else
1205 int re_max_failures = 2000;
1206 #endif
1207
1208 union fail_stack_elt {
1209         const unsigned char *pointer;
1210         int integer;
1211 };
1212
1213 typedef union fail_stack_elt fail_stack_elt_t;
1214
1215 typedef struct {
1216         fail_stack_elt_t *stack;
1217         Element_count size;
1218         Element_count avail;    /* Offset of next open position.  */
1219 } fail_stack_type;
1220
1221 #define FAIL_STACK_EMPTY()     (fail_stack.avail == 0)
1222 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1223 #define FAIL_STACK_FULL()      (fail_stack.avail == fail_stack.size)
1224
1225 /* Define macros to initialize and free the failure stack.
1226    Do `return -2' if the alloc fails.  */
1227
1228 #ifdef MATCH_MAY_ALLOCATE
1229 #define INIT_FAIL_STACK()                                               \
1230         do {                                                            \
1231                 fail_stack.stack = (fail_stack_elt_t *)                 \
1232                         REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC *      \
1233                                               sizeof (fail_stack_elt_t)); \
1234                                                                         \
1235                 if (fail_stack.stack == NULL)                           \
1236                         return -2;                                      \
1237                                                                         \
1238                 fail_stack.size = INIT_FAILURE_ALLOC;                   \
1239                 fail_stack.avail = 0;                                   \
1240         } while (0)
1241
1242 #define RESET_FAIL_STACK()  REGEX_FREE_STACK (fail_stack.stack)
1243 #else
1244 #define INIT_FAIL_STACK()                       \
1245         do {                                    \
1246                 fail_stack.avail = 0;           \
1247         } while (0)
1248
1249 #define RESET_FAIL_STACK()
1250 #endif
1251
1252 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1253
1254    Return 1 if succeeds, and 0 if either ran out of memory
1255    allocating space for it or it was already too large.
1256
1257    REGEX_REALLOCATE_STACK requires `destination' be declared.   */
1258
1259 #define DOUBLE_FAIL_STACK(fail_stack)                                   \
1260   ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS        \
1261    ? 0                                                                  \
1262    : ((fail_stack).stack = (fail_stack_elt_t *)                         \
1263         REGEX_REALLOCATE_STACK ((fail_stack).stack,                     \
1264           (fail_stack).size * sizeof (fail_stack_elt_t),                \
1265           ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)),        \
1266                                                                         \
1267       (fail_stack).stack == NULL                                        \
1268       ? 0                                                               \
1269       : ((fail_stack).size <<= 1,                                       \
1270          1)))
1271
1272 /* Push pointer POINTER on FAIL_STACK.
1273    Return 1 if was able to do so and 0 if ran out of memory allocating
1274    space to do so.  */
1275 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK)                            \
1276   ((FAIL_STACK_FULL ()                                                  \
1277     && !DOUBLE_FAIL_STACK (FAIL_STACK))                                 \
1278    ? 0                                                                  \
1279    : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER,       \
1280       1))
1281
1282 /* Push a pointer value onto the failure stack.
1283    Assumes the variable `fail_stack'.  Probably should only
1284    be called from within `PUSH_FAILURE_POINT'.  */
1285 #define PUSH_FAILURE_POINTER(item)                                      \
1286         fail_stack.stack[fail_stack.avail++].pointer =                  \
1287                 (const unsigned char*)(item)
1288
1289 /* This pushes an integer-valued item onto the failure stack.
1290    Assumes the variable `fail_stack'.  Probably should only
1291    be called from within `PUSH_FAILURE_POINT'.  */
1292 #define PUSH_FAILURE_INT(item)                                  \
1293         fail_stack.stack[fail_stack.avail++].integer = (item)
1294
1295 /* Push a fail_stack_elt_t value onto the failure stack.
1296    Assumes the variable `fail_stack'.  Probably should only
1297    be called from within `PUSH_FAILURE_POINT'.  */
1298 #define PUSH_FAILURE_ELT(item)                                  \
1299         fail_stack.stack[fail_stack.avail++] =  (item)
1300
1301 /* These three POP... operations complement the three PUSH... operations.
1302    All assume that `fail_stack' is nonempty.  */
1303 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1304 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1305 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1306
1307 /* Used to omit pushing failure point id's when we're not debugging.  */
1308 #ifdef REGEX_DEBUG_FLAG
1309 #define DEBUG_PUSH PUSH_FAILURE_INT
1310 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1311 #else
1312 #define DEBUG_PUSH(item)
1313 #define DEBUG_POP(item_addr)
1314 #endif
1315
1316 /* Push the information about the state we will need
1317    if we ever fail back to it.
1318
1319    Requires variables fail_stack, regstart, regend, reg_info, and
1320    num_regs be declared.  DOUBLE_FAIL_STACK requires `destination' be
1321    declared.
1322
1323    Does `return FAILURE_CODE' if runs out of memory.  */
1324
1325 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1326 #define DECLARE_DESTINATION char *destination
1327 #else
1328 #define DECLARE_DESTINATION DECLARE_NOTHING
1329 #endif
1330
1331 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code)   \
1332         do {                                                            \
1333                 DECLARE_DESTINATION;                                    \
1334                 /* Must be int, so when we don't save any registers, the\
1335                    arithmetic \ of 0 + -1 isn't done as unsigned.  */   \
1336                 int this_reg;                                           \
1337                                                                         \
1338                 DEBUG_STATEMENT (failure_id++);                         \
1339                 DEBUG_STATEMENT (nfailure_points_pushed++);             \
1340                 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1341                 DEBUG_PRINT2 ("  Before push, next avail: %lu\n",       \
1342                               (long unsigned int)(fail_stack).avail);   \
1343                 DEBUG_PRINT2 ("                     size: %lu\n",       \
1344                               (long unsigned int)(fail_stack).size);    \
1345                                                                         \
1346                 DEBUG_PRINT2 ("  slots needed: %d\n", NUM_FAILURE_ITEMS); \
1347                 DEBUG_PRINT2 ("     available: %ld\n",                  \
1348                               (long) REMAINING_AVAIL_SLOTS);            \
1349                                                                         \
1350                 /* Ensure we have enough space allocated for what       \
1351                  * we will push.  */                                    \
1352                 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) {     \
1353                         if (!DOUBLE_FAIL_STACK (fail_stack))            \
1354                                 return failure_code;                    \
1355                                                                         \
1356                         DEBUG_PRINT2 ("\n  Doubled stack; size now: %lu\n", \
1357                                       (unsigned long) (fail_stack).size); \
1358                         DEBUG_PRINT2 ("  slots available: %ld\n",       \
1359                                       (long) REMAINING_AVAIL_SLOTS);    \
1360                 }                                                       \
1361                                                                         \
1362                 /* Push the info, starting with the registers.  */      \
1363                 DEBUG_PRINT1 ("\n");                                    \
1364                                                                         \
1365                 for (this_reg = lowest_active_reg;                      \
1366                      this_reg <= highest_active_reg;                    \
1367                      this_reg++) {                                      \
1368                         DEBUG_PRINT2 ("  Pushing reg: %d\n", this_reg); \
1369                         DEBUG_STATEMENT (num_regs_pushed++);            \
1370                                                                         \
1371                         DEBUG_PRINT2("    start: %p\n",                 \
1372                                      regstart[this_reg]);               \
1373                         PUSH_FAILURE_POINTER(regstart[this_reg]);       \
1374                                                                         \
1375                         DEBUG_PRINT2 ("    end: %p\n",                  \
1376                                       regend[this_reg]);                \
1377                         PUSH_FAILURE_POINTER(regend[this_reg]); \
1378                                                                         \
1379                         DEBUG_PRINT2 ("    info: 0x%p\n      ",         \
1380                                       &reg_info[this_reg]);             \
1381                         DEBUG_PRINT2 (" match_null=%d",                 \
1382                                       REG_MATCH_NULL_STRING_P           \
1383                                       (reg_info[this_reg]));            \
1384                         DEBUG_PRINT2 (" active=%d",                     \
1385                                       IS_ACTIVE (reg_info[this_reg]));  \
1386                         DEBUG_PRINT2 (" matched_something=%d",          \
1387                                       MATCHED_SOMETHING                 \
1388                                       (reg_info[this_reg]));            \
1389                         DEBUG_PRINT2 (" ever_matched_something=%d",     \
1390                                       EVER_MATCHED_SOMETHING            \
1391                                       (reg_info[this_reg]));            \
1392                         DEBUG_PRINT1 ("\n");                            \
1393                         PUSH_FAILURE_ELT(reg_info[this_reg].word);      \
1394                 }                                                       \
1395                                                                         \
1396                 DEBUG_PRINT2 ("  Pushing  low active reg: %d\n",        \
1397                               lowest_active_reg);                       \
1398                 PUSH_FAILURE_INT (lowest_active_reg);                   \
1399                                                                         \
1400                 DEBUG_PRINT2 ("  Pushing high active reg: %d\n",        \
1401                               highest_active_reg);                      \
1402                 PUSH_FAILURE_INT (highest_active_reg);                  \
1403                                                                         \
1404                 DEBUG_PRINT2 ("  Pushing pattern 0x%lx: \n",            \
1405                               (long) pattern_place);                    \
1406                 DEBUG_PRINT_COMPILED_PATTERN(bufp, pattern_place, pend); \
1407                 PUSH_FAILURE_POINTER (pattern_place);                   \
1408                                                                         \
1409                 DEBUG_PRINT2("  Pushing string %p: `",                  \
1410                              string_place);                             \
1411                 DEBUG_PRINT_DOUBLE_STRING(string_place,                 \
1412                                           string1, size1,               \
1413                                           string2, size2);              \
1414                 DEBUG_PRINT1("'\n");                                    \
1415                 PUSH_FAILURE_POINTER (string_place);                    \
1416                                                                         \
1417                 DEBUG_PRINT2("  Pushing failure id: %u\n", failure_id); \
1418                 DEBUG_PUSH(failure_id);                                 \
1419         } while (0)
1420
1421 /* This is the number of items that are pushed and popped on the stack
1422    for each register.  */
1423 #define NUM_REG_ITEMS  3
1424
1425 /* Individual items aside from the registers.  */
1426 #ifdef DEBUG
1427 #define NUM_NONREG_ITEMS 5      /* Includes failure point id.  */
1428 #else
1429 #define NUM_NONREG_ITEMS 4
1430 #endif
1431
1432 /* We push at most this many items on the stack.  */
1433 /* We used to use (num_regs - 1), which is the number of registers
1434    this regexp will save; but that was changed to 5
1435    to avoid stack overflow for a regexp with lots of parens.  */
1436 #define MAX_FAILURE_ITEMS (5 * (NUM_REG_ITEMS + NUM_NONREG_ITEMS))
1437
1438 /* We actually push this many items.  */
1439 #define NUM_FAILURE_ITEMS                                               \
1440   ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1441     + NUM_NONREG_ITEMS)
1442
1443 /* How many items can still be added to the stack without overflowing it.  */
1444 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1445
1446 /* Pops what PUSH_FAIL_STACK pushes.
1447
1448    We restore into the parameters, all of which should be lvalues:
1449      STR -- the saved data position.
1450      PAT -- the saved pattern position.
1451      LOW_REG, HIGH_REG -- the highest and lowest active registers.
1452      REGSTART, REGEND -- arrays of string positions.
1453      REG_INFO -- array of information about each subexpression.
1454
1455    Also assumes the variables `fail_stack' and (if debugging), `bufp',
1456    `pend', `string1', `size1', `string2', and `size2'.  */
1457
1458 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg,                  \
1459                           regstart, regend, reg_info)                   \
1460         do {                                                            \
1461                 int this_reg;                                           \
1462                 const unsigned char *string_temp;                       \
1463                 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id);         \
1464                                                                         \
1465                 assert (!FAIL_STACK_EMPTY ());                          \
1466                                                                         \
1467                 /* Remove failure points and point to                   \
1468                  * how many regs pushed.  */                            \
1469                 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n");                  \
1470                 DEBUG_PRINT2 ("  Before pop, next avail: %lu\n",        \
1471                               (unsigned long) fail_stack.avail);        \
1472                 DEBUG_PRINT2 ("                    size: %lu\n",        \
1473                               (unsigned long) fail_stack.size);         \
1474                                                                         \
1475                 assert (fail_stack.avail >= NUM_NONREG_ITEMS);          \
1476                                                                         \
1477                 DEBUG_POP (&ffailure_id.integer);                       \
1478                 DEBUG_PRINT2 ("  Popping failure id: %u\n",             \
1479                               * (unsigned int *) &ffailure_id);         \
1480                                                                         \
1481                 /* If the saved string location is NULL, it             \
1482                    came from an on_failure_keep_string_jump opcode,     \
1483                    and we want to throw away the saved NULL, thus       \
1484                    retaining our current position in the string.  */    \
1485                 string_temp = (const unsigned char *)                   \
1486                         POP_FAILURE_POINTER ();                         \
1487                 if (string_temp != NULL)                                \
1488                         str = string_temp;                              \
1489                                                                         \
1490                 DEBUG_PRINT2("  Popping string %p: `", str);            \
1491                 DEBUG_PRINT_DOUBLE_STRING(str,                          \
1492                                           string1, size1,               \
1493                                           string2, size2);              \
1494                 DEBUG_PRINT1("'\n");                                    \
1495                                                                         \
1496                 pat = (unsigned char *) POP_FAILURE_POINTER();          \
1497                 DEBUG_PRINT2 ("  Popping pattern %p: ", pat);           \
1498                 DEBUG_PRINT_COMPILED_PATTERN(bufp, pat, pend);          \
1499                                                                         \
1500                 /* Restore register info.  */                           \
1501                 high_reg = (unsigned) POP_FAILURE_INT ();               \
1502                 DEBUG_PRINT2 ("  Popping high active reg: %d\n", high_reg); \
1503                                                                         \
1504                 low_reg = (unsigned) POP_FAILURE_INT ();                \
1505                 DEBUG_PRINT2 ("  Popping  low active reg: %d\n", low_reg); \
1506                                                                         \
1507                 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) { \
1508                         DEBUG_PRINT2 ("    Popping reg: %d\n", this_reg); \
1509                                                                         \
1510                         reg_info[this_reg].word = POP_FAILURE_ELT ();   \
1511                         DEBUG_PRINT2 ("      info: %p\n",               \
1512                                       &reg_info[this_reg]);             \
1513                                                                         \
1514                         regend[this_reg] = POP_FAILURE_POINTER ();      \
1515                         DEBUG_PRINT2 ("      end: %p\n",                \
1516                                       regend[this_reg]);                \
1517                                                                         \
1518                         regstart[this_reg] = POP_FAILURE_POINTER ();    \
1519                         DEBUG_PRINT2 ("      start: %p\n",              \
1520                                       regstart[this_reg]);              \
1521                 }                                                       \
1522                                                                         \
1523                 set_regs_matched_done = 0;                              \
1524                 DEBUG_STATEMENT (nfailure_points_popped++);             \
1525         } while (0)                     /* POP_FAILURE_POINT */
1526 \f
1527 /* Structure for per-register (a.k.a. per-group) information.
1528    Other register information, such as the
1529    starting and ending positions (which are addresses), and the list of
1530    inner groups (which is a bits list) are maintained in separate
1531    variables.
1532
1533    We are making a (strictly speaking) nonportable assumption here: that
1534    the compiler will pack our bit fields into something that fits into
1535    the type of `word', i.e., is something that fits into one item on the
1536    failure stack.  */
1537
1538 typedef union {
1539         fail_stack_elt_t word;
1540         struct {
1541                 /* This field is one if this group can match the empty string,
1542                    zero if not.  If not yet determined,  `MATCH_NULL_UNSET_VALUE'.  */
1543 #define MATCH_NULL_UNSET_VALUE 3
1544                 unsigned match_null_string_p:2;
1545                 unsigned is_active:1;
1546                 unsigned matched_something:1;
1547                 unsigned ever_matched_something:1;
1548         } bits;
1549 } register_info_type;
1550
1551 #define REG_MATCH_NULL_STRING_P(R)  ((R).bits.match_null_string_p)
1552 #define IS_ACTIVE(R)  ((R).bits.is_active)
1553 #define MATCHED_SOMETHING(R)  ((R).bits.matched_something)
1554 #define EVER_MATCHED_SOMETHING(R)  ((R).bits.ever_matched_something)
1555
1556 /* Call this when have matched a real character; it sets `matched' flags
1557    for the subexpressions which we are currently inside.  Also records
1558    that those subexprs have matched.  */
1559 #define SET_REGS_MATCHED()                                              \
1560   do                                                                    \
1561     {                                                                   \
1562       if (!set_regs_matched_done)                                       \
1563         {                                                               \
1564           Element_count r;                                              \
1565           set_regs_matched_done = 1;                                    \
1566           for (r = lowest_active_reg; r <= highest_active_reg; r++)     \
1567             {                                                           \
1568               MATCHED_SOMETHING (reg_info[r])                           \
1569                 = EVER_MATCHED_SOMETHING (reg_info[r])                  \
1570                 = 1;                                                    \
1571             }                                                           \
1572         }                                                               \
1573     }                                                                   \
1574   while (0)
1575
1576 /* Registers are set to a sentinel when they haven't yet matched.  */
1577 static unsigned char reg_unset_dummy;
1578 #define REG_UNSET_VALUE (&reg_unset_dummy)
1579 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1580 \f
1581 /* Subroutine declarations and macros for regex_compile.  */
1582
1583 /* Fetch the next character in the uncompiled pattern---translating it
1584    if necessary.  Also cast from a signed character in the constant
1585    string passed to us by the user to an unsigned char that we can use
1586    as an array index (in, e.g., `translate').  */
1587 #define PATFETCH(c)                                                     \
1588   do {                                                                  \
1589     PATFETCH_RAW (c);                                                   \
1590     c = TRANSLATE (c);                                                  \
1591   } while (0)
1592
1593 /* Fetch the next character in the uncompiled pattern, with no
1594    translation.  */
1595 #define PATFETCH_RAW(c)                                                 \
1596   do {if (p == pend) return REG_EEND;                                   \
1597     assert (p < pend);                                                  \
1598     c = charptr_emchar (p);                                             \
1599     INC_CHARPTR (p);                                                    \
1600   } while (0)
1601
1602 /* Go backwards one character in the pattern.  */
1603 #define PATUNFETCH DEC_CHARPTR (p)
1604
1605 #ifdef MULE
1606
1607 #define PATFETCH_EXTENDED(emch)                                         \
1608   do {if (p == pend) return REG_EEND;                                   \
1609     assert (p < pend);                                                  \
1610     emch = charptr_emchar ((const Bufbyte *) p);                        \
1611     INC_CHARPTR (p);                                                    \
1612     if (TRANSLATE_P (translate) && emch < 0x80)                         \
1613       emch = (Emchar) (unsigned char) RE_TRANSLATE (emch);              \
1614   } while (0)
1615
1616 #define PATFETCH_RAW_EXTENDED(emch)                                     \
1617   do {if (p == pend) return REG_EEND;                                   \
1618     assert (p < pend);                                                  \
1619     emch = charptr_emchar ((const Bufbyte *) p);                        \
1620     INC_CHARPTR (p);                                                    \
1621   } while (0)
1622
1623 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1624
1625 #define PATFETCH_EITHER(emch)                   \
1626   do {                                          \
1627     if (has_extended_chars)                     \
1628       PATFETCH_EXTENDED (emch);                 \
1629     else                                        \
1630       PATFETCH (emch);                          \
1631   } while (0)
1632
1633 #define PATFETCH_RAW_EITHER(emch)               \
1634   do {                                          \
1635     if (has_extended_chars)                     \
1636       PATFETCH_RAW_EXTENDED (emch);             \
1637     else                                        \
1638       PATFETCH_RAW (emch);                      \
1639   } while (0)
1640
1641 #define PATUNFETCH_EITHER                       \
1642   do {                                          \
1643     if (has_extended_chars)                     \
1644       PATUNFETCH_EXTENDED (emch);               \
1645     else                                        \
1646       PATUNFETCH (emch);                        \
1647   } while (0)
1648
1649 #else                           /* not MULE */
1650
1651 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1652 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1653 #define PATUNFETCH_EITHER PATUNFETCH
1654
1655 #endif                          /* MULE */
1656
1657 /* If `translate' is non-null, return translate[D], else just D.  We
1658    cast the subscript to translate because some data is declared as
1659    `char *', to avoid warnings when a string constant is passed.  But
1660    when we use a character as a subscript we must make it unsigned.  */
1661 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1662
1663 /* Macros for outputting the compiled pattern into `buffer'.  */
1664
1665 /* If the buffer isn't allocated when it comes in, use this.  */
1666 #define INIT_BUF_SIZE  32
1667
1668 /* Make sure we have at least N more bytes of space in buffer.  */
1669 #define GET_BUFFER_SPACE(n)                                             \
1670     while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated)  \
1671       EXTEND_BUFFER ()
1672
1673 /* Make sure we have one more byte of buffer space and then add C to it.  */
1674 #define BUF_PUSH(c)                                                     \
1675   do {                                                                  \
1676     GET_BUFFER_SPACE (1);                                               \
1677     *buf_end++ = (unsigned char) (c);                                   \
1678   } while (0)
1679
1680 /* Ensure we have two more bytes of buffer space and then append C1 and C2.  */
1681 #define BUF_PUSH_2(c1, c2)                                              \
1682   do {                                                                  \
1683     GET_BUFFER_SPACE (2);                                               \
1684     *buf_end++ = (unsigned char) (c1);                                  \
1685     *buf_end++ = (unsigned char) (c2);                                  \
1686   } while (0)
1687
1688 /* As with BUF_PUSH_2, except for three bytes.  */
1689 #define BUF_PUSH_3(c1, c2, c3)                                          \
1690   do {                                                                  \
1691     GET_BUFFER_SPACE (3);                                               \
1692     *buf_end++ = (unsigned char) (c1);                                  \
1693     *buf_end++ = (unsigned char) (c2);                                  \
1694     *buf_end++ = (unsigned char) (c3);                                  \
1695   } while (0)
1696
1697 /* Store a jump with opcode OP at LOC to location TO.  We store a
1698    relative address offset by the three bytes the jump itself occupies.  */
1699 #define STORE_JUMP(op, loc, to) \
1700   store_op1 (op, loc, (to) - (loc) - 3)
1701
1702 /* Likewise, for a two-argument jump.  */
1703 #define STORE_JUMP2(op, loc, to, arg) \
1704   store_op2 (op, loc, (to) - (loc) - 3, arg)
1705
1706 /* Like `STORE_JUMP', but for inserting.  Assume `buf_end' is the
1707    buffer end.  */
1708 #define INSERT_JUMP(op, loc, to) \
1709   insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1710
1711 /* Like `STORE_JUMP2', but for inserting.  Assume `buf_end' is the
1712    buffer end.  */
1713 #define INSERT_JUMP2(op, loc, to, arg) \
1714   insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1715
1716 /* This is not an arbitrary limit: the arguments which represent offsets
1717    into the pattern are two bytes long.  So if 2^16 bytes turns out to
1718    be too small, many things would have to change.  */
1719 #define MAX_BUF_SIZE (1L << 16)
1720
1721 /* Extend the buffer by twice its current size via realloc and
1722    reset the pointers that pointed into the old block to point to the
1723    correct places in the new one.  If extending the buffer results in it
1724    being larger than MAX_BUF_SIZE, then flag memory exhausted.  */
1725 #define EXTEND_BUFFER()                                                 \
1726         do {                                                            \
1727                 re_char *old_buffer = bufp->buffer;                     \
1728                 if (bufp->allocated == MAX_BUF_SIZE)                    \
1729                         return REG_ESIZE;                               \
1730                 bufp->allocated <<= 1;                                  \
1731                 if (bufp->allocated > MAX_BUF_SIZE)                     \
1732                         bufp->allocated = MAX_BUF_SIZE;                 \
1733                 bufp->buffer = (unsigned char *)xrealloc(               \
1734                         bufp->buffer, bufp->allocated);                 \
1735                 if (bufp->buffer == NULL) {                             \
1736                         return REG_ESPACE;                              \
1737                 }                                                       \
1738                 /* If the buffer moved, move all the pointers into it.  */ \
1739                 if (old_buffer != bufp->buffer) {                       \
1740                         buf_end = (buf_end - old_buffer) + bufp->buffer; \
1741                         begalt = (begalt - old_buffer) + bufp->buffer;  \
1742                         if (fixup_alt_jump) {                           \
1743                                 fixup_alt_jump =                        \
1744                                         (fixup_alt_jump - old_buffer) + \
1745                                         bufp->buffer;                   \
1746                         }                                               \
1747                         if (laststart) {                                \
1748                                 laststart = (laststart - old_buffer) +  \
1749                                         bufp->buffer;                   \
1750                         }                                               \
1751                         if (pending_exact) {                            \
1752                                 pending_exact =                         \
1753                                         (pending_exact - old_buffer) +  \
1754                                         bufp->buffer;                   \
1755                         }                                               \
1756                 }                                                       \
1757         } while (0)
1758
1759 /* Since we have one byte reserved for the register number argument to
1760    {start,stop}_memory, the maximum number of groups we can report
1761    things about is what fits in that byte.  */
1762 #define MAX_REGNUM 255
1763
1764 /* But patterns can have more than `MAX_REGNUM' registers.  We just
1765    ignore the excess.  */
1766 typedef unsigned regnum_t;
1767
1768 #define INIT_REG_TRANSLATE_SIZE 5
1769
1770 /* Macros for the compile stack.  */
1771
1772 /* Since offsets can go either forwards or backwards, this type needs to
1773    be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1.  */
1774 typedef int pattern_offset_t;
1775
1776 typedef struct {
1777         pattern_offset_t begalt_offset;
1778         pattern_offset_t fixup_alt_jump;
1779         pattern_offset_t inner_group_offset;
1780         pattern_offset_t laststart_offset;
1781         regnum_t regnum;
1782 } compile_stack_elt_t;
1783
1784 typedef struct {
1785         compile_stack_elt_t *stack;
1786         unsigned size;
1787         unsigned avail;         /* Offset of next open position.  */
1788 } compile_stack_type;
1789
1790 #define INIT_COMPILE_STACK_SIZE 32
1791
1792 #define COMPILE_STACK_EMPTY  (compile_stack.avail == 0)
1793 #define COMPILE_STACK_FULL  (compile_stack.avail == compile_stack.size)
1794
1795 /* The next available element.  */
1796 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1797
1798 /* Set the bit for character C in a bit vector.  */
1799 #define SET_LIST_BIT(c)                         \
1800   (buf_end[((unsigned char) (c)) / BYTEWIDTH]   \
1801    |= 1 << (((unsigned char) c) % BYTEWIDTH))
1802
1803 #ifdef MULE
1804
1805 /* Set the "bit" for character C in a range table. */
1806 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1807
1808 /* Set the "bit" for character c in the appropriate table. */
1809 #define SET_EITHER_BIT(c)                       \
1810         do {                                    \
1811                 if (has_extended_chars)         \
1812                         SET_RANGETAB_BIT (c);   \
1813                 else                            \
1814                         SET_LIST_BIT (c);       \
1815         } while (0)
1816
1817 #else                           /* not MULE */
1818
1819 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1820
1821 #endif
1822
1823 /* Get the next unsigned number in the uncompiled pattern.  */
1824 #define GET_UNSIGNED_NUMBER(num)                                        \
1825   { if (p != pend)                                                      \
1826      {                                                                  \
1827        PATFETCH (c);                                                    \
1828        while (ISDIGIT (c))                                              \
1829          {                                                              \
1830            if (num < 0)                                                 \
1831               num = 0;                                                  \
1832            num = num * 10 + c - '0';                                    \
1833            if (p == pend)                                               \
1834               break;                                                    \
1835            PATFETCH (c);                                                \
1836          }                                                              \
1837        }                                                                \
1838     }
1839
1840 #define CHAR_CLASS_MAX_LENGTH  9        /* Namely, `multibyte'.  */
1841
1842 \f
1843 static void store_op1(re_opcode_t op, unsigned char *loc, int arg);
1844 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1845 static void insert_op1(re_opcode_t op, unsigned char *loc, int arg,
1846                        unsigned char *end);
1847 static void insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1848                        unsigned char *end);
1849 static re_bool at_begline_loc_p(re_char*, re_char*, reg_syntax_t);
1850 static re_bool at_endline_loc_p(re_char*, re_char*, int syntax);
1851 static re_bool group_in_compile_stack(compile_stack_type compile_stack,
1852                                       regnum_t regnum);
1853 static reg_errcode_t compile_range(re_char ** p_ptr, re_char * pend,
1854                                    RE_TRANSLATE_TYPE translate,
1855                                    reg_syntax_t syntax, unsigned char *b);
1856 #ifdef MULE
1857 static reg_errcode_t compile_extended_range(re_char ** p_ptr,
1858                                             re_char * pend,
1859                                             RE_TRANSLATE_TYPE translate,
1860                                             reg_syntax_t syntax,
1861                                             Lisp_Object rtab);
1862 #endif                          /* MULE */
1863 static re_bool group_match_null_string_p(unsigned char **p,
1864                                          unsigned char *end,
1865                                          register_info_type * reg_info);
1866 static re_bool alt_match_null_string_p(unsigned char *p, unsigned char *end,
1867                                        register_info_type * reg_info);
1868 static re_bool common_op_match_null_string_p(unsigned char **p,
1869                                              unsigned char *end,
1870                                              register_info_type * reg_info);
1871 static int bcmp_translate(const unsigned char *s1, const unsigned char *s2,
1872                           REGISTER int len, RE_TRANSLATE_TYPE translate);
1873 static int re_match_2_internal(struct re_pattern_buffer *bufp,
1874                                re_char * string1, int size1,
1875                                re_char * string2, int size2, int pos,
1876                                struct re_registers *regs, int stop);
1877 \f
1878 #ifndef MATCH_MAY_ALLOCATE
1879
1880 /* If we cannot allocate large objects within re_match_2_internal,
1881    we make the fail stack and register vectors global.
1882    The fail stack, we grow to the maximum size when a regexp
1883    is compiled.
1884    The register vectors, we adjust in size each time we
1885    compile a regexp, according to the number of registers it needs.  */
1886
1887 static fail_stack_type fail_stack;
1888
1889 /* Size with which the following vectors are currently allocated.
1890    That is so we can make them bigger as needed,
1891    but never make them smaller.  */
1892 static int regs_allocated_size;
1893
1894 static re_char **regstart, **regend;
1895 static re_char **old_regstart, **old_regend;
1896 static re_char **best_regstart, **best_regend;
1897 static register_info_type *reg_info;
1898 static re_char **reg_dummy;
1899 static register_info_type *reg_info_dummy;
1900
1901 /* Make the register vectors big enough for NUM_REGS registers,
1902    but don't make them smaller.  */
1903
1904 static void regex_grow_registers(int num_regs)
1905 {
1906         if (num_regs > regs_allocated_size) {
1907                 RETALLOC_IF(regstart, num_regs, re_char *);
1908                 RETALLOC_IF(regend, num_regs, re_char *);
1909                 RETALLOC_IF(old_regstart, num_regs, re_char *);
1910                 RETALLOC_IF(old_regend, num_regs, re_char *);
1911                 RETALLOC_IF(best_regstart, num_regs, re_char *);
1912                 RETALLOC_IF(best_regend, num_regs, re_char *);
1913                 RETALLOC_IF(reg_info, num_regs, register_info_type);
1914                 RETALLOC_IF(reg_dummy, num_regs, re_char *);
1915                 RETALLOC_IF(reg_info_dummy, num_regs, register_info_type);
1916
1917                 regs_allocated_size = num_regs;
1918         }
1919 }
1920
1921 #endif                          /* not MATCH_MAY_ALLOCATE */
1922
1923 /* auxiliary stuff, FSF theft */
1924 /* Character classes.  */
1925 typedef enum {
1926         RECC_ERROR = 0,
1927         RECC_ALNUM, RECC_ALPHA, RECC_WORD,
1928         RECC_GRAPH, RECC_PRINT,
1929         RECC_LOWER, RECC_UPPER,
1930         RECC_PUNCT, RECC_CNTRL,
1931         RECC_DIGIT, RECC_XDIGIT,
1932         RECC_BLANK, RECC_SPACE,
1933         RECC_MULTIBYTE, RECC_NONASCII,
1934         RECC_ASCII, RECC_UNIBYTE
1935 } re_wctype_t;
1936
1937 /* Map a string to the char class it names (if any).  */
1938 static inline re_wctype_t
1939 re_wctype(re_char *str)
1940 {
1941         const char *string = (const char*)str;
1942
1943         if (STREQ (string, "alnum")) {
1944                 return RECC_ALNUM;
1945         } else if (STREQ (string, "alpha")) {
1946                 return RECC_ALPHA;
1947         } else if (STREQ (string, "digit")) {
1948                 return RECC_DIGIT;
1949         } else if (STREQ (string, "graph")) {
1950                 return RECC_GRAPH;
1951         } else if (STREQ (string, "space")) {
1952                 return RECC_SPACE;
1953         } else if (STREQ (string, "upper")) {
1954                 return RECC_UPPER;
1955         } else if (STREQ (string, "lower")) {
1956                 return RECC_LOWER;
1957         } else if (STREQ (string, "word")) {
1958                 return RECC_WORD;
1959         } else if (STREQ (string, "print")) {
1960                 return RECC_PRINT;
1961         } else if (STREQ (string, "punct")) {
1962                 return RECC_PUNCT;
1963         } else if (STREQ (string, "xdigit")) {
1964                 return RECC_XDIGIT;
1965         } else if (STREQ (string, "cntrl")) {
1966                 return RECC_CNTRL;
1967         } else if (STREQ (string, "blank")) {
1968                 return RECC_BLANK;
1969         } else if (STREQ (string, "ascii")) {
1970                 return RECC_ASCII;
1971         } else if (STREQ (string, "nonascii")) {
1972                 return RECC_NONASCII;
1973         } else if (STREQ (string, "unibyte")) {
1974                 return RECC_UNIBYTE;
1975         } else if (STREQ (string, "multibyte")) {
1976                 return RECC_MULTIBYTE;
1977         } else {
1978                 return RECC_ERROR;
1979         }
1980 }
1981
1982 /* True if CH is in the char class CC.  */
1983 static inline bool
1984 re_iswctype(int ch, re_wctype_t cc)
1985 {
1986         switch (cc) {
1987         case RECC_ALNUM:
1988                 return ISALNUM (ch);
1989         case RECC_ALPHA:
1990                 return ISALPHA (ch);
1991         case RECC_BLANK:
1992                 return ISBLANK (ch);
1993         case RECC_CNTRL:
1994                 return ISCNTRL (ch);
1995         case RECC_DIGIT:
1996                 return ISDIGIT (ch);
1997         case RECC_GRAPH:
1998                 return ISGRAPH (ch);
1999         case RECC_LOWER:
2000                 return ISLOWER (ch);
2001         case RECC_PRINT:
2002                 return ISPRINT (ch);
2003         case RECC_PUNCT:
2004                 return ISPUNCT (ch);
2005         case RECC_SPACE:
2006                 return ISSPACE (ch);
2007         case RECC_UPPER:
2008                 return ISUPPER (ch);
2009         case RECC_XDIGIT:
2010                 return ISXDIGIT (ch);
2011         case RECC_ASCII:
2012                 return IS_REAL_ASCII (ch);
2013         case RECC_NONASCII:
2014                 return !IS_REAL_ASCII (ch);
2015         case RECC_WORD:
2016                 return ISWORD(ch);
2017         case RECC_UNIBYTE:
2018                 return ISUNIBYTE((unsigned int)ch);
2019         case RECC_MULTIBYTE:
2020                 return !ISUNIBYTE((unsigned int)ch);
2021         case RECC_ERROR:
2022                 return false;
2023         default:
2024                 abort();
2025         }
2026         return false;
2027 }
2028
2029 \f
2030 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2031    Returns one of error codes defined in `regex.h', or zero for success.
2032
2033    Assumes the `allocated' (and perhaps `buffer') and `translate'
2034    fields are set in BUFP on entry.
2035
2036    If it succeeds, results are put in BUFP (if it returns an error, the
2037    contents of BUFP are undefined):
2038      `buffer' is the compiled pattern;
2039      `syntax' is set to SYNTAX;
2040      `used' is set to the length of the compiled pattern;
2041      `fastmap_accurate' is zero;
2042      `re_ngroups' is the number of groups/subexpressions (including shy
2043         groups) in PATTERN;
2044      `re_nsub' is the number of non-shy groups in PATTERN;
2045      `not_bol' and `not_eol' are zero;
2046
2047    The `fastmap' and `newline_anchor' fields are neither
2048    examined nor set.  */
2049
2050 static reg_errcode_t
2051 regex_compile(re_char * pattern, int size, reg_syntax_t syntax,
2052               struct re_pattern_buffer *bufp)
2053 {
2054         /* We fetch characters from PATTERN here.  We declare these as int
2055            (or possibly long) so that chars above 127 can be used as
2056            array indices.  The macros that fetch a character from the pattern
2057            make sure to coerce to unsigned char before assigning, so we won't
2058            get bitten by negative numbers here. */
2059         /* XEmacs change: used to be unsigned char. */
2060         REGISTER EMACS_INT c, c1;
2061
2062         /* A random temporary spot in PATTERN.  */
2063         re_char *p1;
2064
2065         /* Points to the end of the buffer, where we should append.  */
2066         REGISTER unsigned char *buf_end;
2067
2068         /* Keeps track of unclosed groups.  */
2069         compile_stack_type compile_stack;
2070
2071         /* Points to the current (ending) position in the pattern.  */
2072         re_char *p = pattern;
2073         re_char *pend = pattern + size;
2074
2075         /* How to translate the characters in the pattern.  */
2076         RE_TRANSLATE_TYPE translate = bufp->translate;
2077
2078         /* Address of the count-byte of the most recently inserted `exactn'
2079            command.  This makes it possible to tell if a new exact-match
2080            character can be added to that command or if the character requires
2081            a new `exactn' command.  */
2082         unsigned char *pending_exact = 0;
2083
2084         /* Address of start of the most recently finished expression.
2085            This tells, e.g., postfix * where to find the start of its
2086            operand.  Reset at the beginning of groups and alternatives.  */
2087         unsigned char *laststart = 0;
2088
2089         /* Address of beginning of regexp, or inside of last group.  */
2090         unsigned char *begalt;
2091
2092         /* Place in the uncompiled pattern (i.e., the {) to
2093            which to go back if the interval is invalid.  */
2094         re_char *beg_interval;
2095
2096         /* Address of the place where a forward jump should go to the end of
2097            the containing expression.  Each alternative of an `or' -- except the
2098            last -- ends with a forward jump of this sort.  */
2099         unsigned char *fixup_alt_jump = 0;
2100
2101         /* Counts open-groups as they are encountered.  Remembered for the
2102            matching close-group on the compile stack, so the same register
2103            number is put in the stop_memory as the start_memory.  */
2104         regnum_t regnum = 0;
2105
2106         /* we need to close over compile_stack */
2107 #       define free_stack(args...)      xfree(compile_stack.stack)
2108
2109 #ifdef REGEX_DEBUG_FLAG
2110         DEBUG_PRINT1("\nCompiling pattern: ");
2111         if (debug) {
2112                 int debug_count;
2113
2114                 for (debug_count = 0; debug_count < size; debug_count++)
2115                         putchar(pattern[debug_count]);
2116                 putchar('\n');
2117         }
2118 #endif                          /* DEBUG */
2119
2120         /* Initialize the compile stack.  */
2121         compile_stack.stack =
2122                 TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2123         if (compile_stack.stack == NULL) {
2124                 return REG_ESPACE;
2125         }
2126         compile_stack.size = INIT_COMPILE_STACK_SIZE;
2127         compile_stack.avail = 0;
2128
2129         /* Initialize the pattern buffer.  */
2130         bufp->syntax = syntax;
2131         bufp->fastmap_accurate = 0;
2132         bufp->not_bol = bufp->not_eol = 0;
2133
2134         /* Set `used' to zero, so that if we return an error, the pattern
2135            printer (for debugging) will think there's no pattern.  We reset it
2136            at the end.  */
2137         bufp->used = 0;
2138
2139         /* Always count groups, whether or not bufp->no_sub is set.  */
2140         bufp->re_nsub = 0;
2141         bufp->re_ngroups = 0;
2142
2143         /* Allocate index translation array if needed. */
2144         if (bufp->external_to_internal_register == 0) {
2145                 bufp->external_to_internal_register_size =
2146                     INIT_REG_TRANSLATE_SIZE;
2147                 RETALLOC(bufp->external_to_internal_register,
2148                          bufp->external_to_internal_register_size, int);
2149         }
2150         /* Initialize translations to impossible value to aid debugging. */
2151         {
2152                 int i;
2153
2154                 bufp->external_to_internal_register[0] = 0;
2155                 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2156                         bufp->external_to_internal_register[i] =
2157                             (int)0xDEADBEEF;
2158         }
2159
2160 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2161         /* Initialize the syntax table.  */
2162         init_syntax_once();
2163 #endif
2164
2165         if (bufp->allocated == 0) {
2166                 if (bufp->buffer) {
2167                         /* If zero allocated, but buffer is non-null, try to
2168                            realloc enough space.  This loses if buffer's address
2169                            is bogus, but that is the user's responsibility.  */
2170                         RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
2171                 } else {
2172                         /* Caller did not allocate a buffer.  Do it for them. */
2173                         bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
2174                 }
2175                 if (!bufp->buffer) {
2176                         free_stack();
2177                         return REG_ESPACE;
2178                 }
2179
2180                 bufp->allocated = INIT_BUF_SIZE;
2181         }
2182
2183         begalt = buf_end = bufp->buffer;
2184
2185         /* Loop through the uncompiled pattern until we're at the end.  */
2186         while (p != pend) {
2187                 PATFETCH(c);
2188
2189                 switch (c) {
2190                 case '^': {
2191                         if (    /* If at start of pattern, it's an operator.  */
2192                                 p == pattern + 1
2193                                 /* If context independent, it's an operator.  */
2194                                 || syntax & RE_CONTEXT_INDEP_ANCHORS
2195                                 /* Otherwise, depends on what's come before.  */
2196                                 || at_begline_loc_p(pattern, p, syntax))
2197                                 BUF_PUSH(begline);
2198                         else
2199                                 goto normal_char;
2200                 }
2201                         break;
2202
2203                 case '$': {
2204                         if (    /* If at end of pattern, it's an
2205                                    operator.  */
2206                                 p == pend
2207                                 /* If context independent, it's an
2208                                    operator.  */
2209                                 || syntax & RE_CONTEXT_INDEP_ANCHORS
2210                                 /* Otherwise, depends on what's
2211                                    next.  */
2212                                 || at_endline_loc_p(p, pend, syntax))
2213                                 BUF_PUSH(endline);
2214                         else
2215                                 goto normal_char;
2216                 }
2217                         break;
2218
2219                 case '+':
2220                 case '?':
2221                         if ((syntax & RE_BK_PLUS_QM) ||
2222                             (syntax & RE_LIMITED_OPS)) {
2223                                 goto normal_char;
2224                         }
2225                         /* pardon? */
2226                 handle_plus:
2227                 case '*': {
2228                         re_bool zero_times_ok;
2229                         re_bool many_times_ok;
2230                         re_bool minimal;
2231
2232                         /* If there is no previous pattern... */
2233                         if (!laststart) {
2234                                 if (syntax & RE_CONTEXT_INVALID_OPS) {
2235                                         free_stack();
2236                                         return REG_BADRPT;
2237                                 } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) {
2238                                         goto normal_char;
2239                                 }
2240                         }
2241
2242                         /* true means zero/many matches are allowed. */
2243                         zero_times_ok = c != '+';
2244                         many_times_ok = c != '?';
2245
2246                         /* true means match shortest string possible. */
2247                         minimal = false;
2248
2249                         /* If there is a sequence of repetition chars, collapse it
2250                            down to just one (the right one).  We can't combine
2251                            interval operators with these because of, e.g., `a{2}*',
2252                            which should only match an even number of `a's.  */
2253                         while (p != pend) {
2254                                 PATFETCH(c);
2255
2256                                 if (c == '*'
2257                                     || (!(syntax & RE_BK_PLUS_QM)
2258                                         && (c == '+' || c == '?'))) ;
2259
2260                                 else if (syntax & RE_BK_PLUS_QM
2261                                          && c == '\\') {
2262                                         if (p == pend) {
2263                                                 free_stack();
2264                                                 return REG_EESCAPE;
2265                                         }
2266
2267                                         PATFETCH(c1);
2268                                         if (!(c1 == '+' || c1 == '?')) {
2269                                                 PATUNFETCH;
2270                                                 PATUNFETCH;
2271                                                 break;
2272                                         }
2273
2274                                         c = c1;
2275                                 } else {
2276                                         PATUNFETCH;
2277                                         break;
2278                                 }
2279
2280                                 /* If we get here, we found another repeat
2281                                    character.  */
2282                                 if (!(syntax & RE_NO_MINIMAL_MATCHING)) {
2283                                         /* "*?" and "+?" and "??" are okay (and
2284                                            mean match minimally), but other
2285                                            sequences (such as "*??" and "+++")
2286                                            are rejected (reserved for future
2287                                            use). */
2288                                         if (minimal || c != '?') {
2289                                                 free_stack();
2290                                                 return REG_BADRPT;
2291                                         }
2292                                         minimal = true;
2293                                 } else {
2294                                         zero_times_ok |= c != '+';
2295                                         many_times_ok |= c != '?';
2296                                 }
2297                         }
2298
2299                         /* Star, etc. applied to an empty pattern is equivalent
2300                            to an empty pattern.  */
2301                         if (!laststart) {
2302                                 break;
2303                         }
2304
2305                         /* Now we know whether zero matches is allowed
2306                            and whether two or more matches is allowed
2307                            and whether we want minimal or maximal matching. */
2308                         if (minimal) {
2309                                 if (!many_times_ok) {
2310                                         /* "a??" becomes:
2311                                            0: /on_failure_jump to 6
2312                                            3: /jump to 9
2313                                            6: /exactn/1/A
2314                                            9: end of pattern.
2315                                         */
2316                                         GET_BUFFER_SPACE(6);
2317                                         INSERT_JUMP(jump, laststart,
2318                                                     buf_end + 3);
2319                                         buf_end += 3;
2320                                         INSERT_JUMP(on_failure_jump,
2321                                                     laststart,
2322                                                     laststart + 6);
2323                                         buf_end += 3;
2324                                 } else if (zero_times_ok) {
2325                                         /* "a*?" becomes:
2326                                            0: /jump to 6
2327                                            3: /exactn/1/A
2328                                            6: /on_failure_jump to 3
2329                                            9: end of pattern.
2330                                         */
2331                                         GET_BUFFER_SPACE(6);
2332                                         INSERT_JUMP(jump, laststart,
2333                                                     buf_end + 3);
2334                                         buf_end += 3;
2335                                         STORE_JUMP(on_failure_jump,
2336                                                    buf_end,
2337                                                    laststart + 3);
2338                                         buf_end += 3;
2339                                 } else {
2340                                         /* "a+?" becomes:
2341                                            0: /exactn/1/A
2342                                            3: /on_failure_jump to 0
2343                                            6: end of pattern.
2344                                         */
2345                                         GET_BUFFER_SPACE(3);
2346                                         STORE_JUMP(on_failure_jump,
2347                                                    buf_end, laststart);
2348                                         buf_end += 3;
2349                                 }
2350                         } else {
2351                                 /* Are we optimizing this jump?  */
2352                                 re_bool keep_string_p = false;
2353
2354                                 if (many_times_ok) {
2355                                         /* More than one repetition is allowed,
2356                                            so put in at the end a backward
2357                                            relative jump from `buf_end' to
2358                                            before the next jump we're going to
2359                                            put in below (which jumps from
2360                                            laststart to after this jump).
2361
2362                                            But if we are at the `*' in the exact
2363                                            sequence `.*\n', insert an
2364                                            unconditional jump backwards to the
2365                                            ., instead of the beginning of the
2366                                            loop.  This way we only push a
2367                                            failure point once, instead of every
2368                                            time through the loop.  */
2369                                         assert(p - 1 > pattern);
2370
2371                                         /* Allocate the space for the jump.  */
2372                                         GET_BUFFER_SPACE(3);
2373
2374                                         /* We know we are not at the first
2375                                            character of the pattern, because
2376                                            laststart was nonzero.  And we've
2377                                            already incremented `p', by the way,
2378                                            to be the character after the `*'.
2379                                            Do we have to do something analogous
2380                                            here for null bytes, because of
2381                                            RE_DOT_NOT_NULL? */
2382                                         if (*(p - 2) == '.' &&
2383                                             zero_times_ok &&
2384                                             p < pend &&
2385                                             *p == '\n' &&
2386                                             !(syntax & RE_DOT_NEWLINE)) {
2387                                                 /* We have .*\n.  */
2388                                                 STORE_JUMP(jump,
2389                                                            buf_end,
2390                                                            laststart);
2391                                                 keep_string_p = true;
2392                                         } else {
2393                                                 /* Anything else.  */
2394                                                 STORE_JUMP
2395                                                         (maybe_pop_jump,
2396                                                          buf_end,
2397                                                          laststart - 3);
2398                                         }
2399                                         /* We've added more stuff to the
2400                                            buffer.  */
2401                                         buf_end += 3;
2402                                 }
2403
2404                                 /* On failure, jump from laststart to buf_end +
2405                                    3, which will be the end of the buffer after
2406                                    this jump is inserted.  */
2407                                 GET_BUFFER_SPACE(3);
2408                                 INSERT_JUMP(keep_string_p ?
2409                                             on_failure_keep_string_jump
2410                                             : on_failure_jump,
2411                                             laststart, buf_end + 3);
2412                                 buf_end += 3;
2413
2414                                 if (!zero_times_ok) {
2415                                         /* At least one repetition is required,
2416                                            so insert a `dummy_failure_jump'
2417                                            before the initial `on_failure_jump'
2418                                            instruction of the loop. This effects
2419                                            a skip over that instruction the
2420                                            first time we hit that loop.  */
2421                                         GET_BUFFER_SPACE(3);
2422                                         INSERT_JUMP(dummy_failure_jump,
2423                                                     laststart,
2424                                                     laststart + 6);
2425                                         buf_end += 3;
2426                                 }
2427                         }
2428                         pending_exact = 0;
2429                 }
2430                         break;
2431
2432                 case '.':
2433                         laststart = buf_end;
2434                         BUF_PUSH(anychar);
2435                         break;
2436
2437                 case '[': {
2438                         /* XEmacs change: this whole section */
2439                         re_bool had_char_class = false;
2440 #ifdef MULE
2441                         re_bool has_extended_chars = false;
2442                         REGISTER Lisp_Object rtab = Qnil;
2443 #endif
2444
2445                         if (p == pend) {
2446                                 free_stack();
2447                                 return REG_EBRACK;
2448                         }
2449
2450                         /* Ensure that we have enough space to push a charset:
2451                            the opcode, the length count, and the bitset; 34
2452                            bytes in all.  */
2453                         GET_BUFFER_SPACE(34);
2454
2455                         laststart = buf_end;
2456
2457                         /* We test `*p == '^' twice, instead of using an if
2458                            statement, so we only need one BUF_PUSH.  */
2459                         BUF_PUSH(*p == '^' ? charset_not : charset);
2460                         if (*p == '^')
2461                                 p++;
2462
2463                         /* Remember the first position in the bracket
2464                            expression.  */
2465                         p1 = p;
2466
2467                         /* Push the number of bytes in the bitmap.  */
2468                         BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2469
2470                         /* Clear the whole map.  */
2471                         memset(buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2472
2473                         /* charset_not matches newline according to a syntax
2474                            bit.  */
2475                         if ((re_opcode_t) buf_end[-2] == charset_not
2476                             && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2477                                 SET_LIST_BIT('\n');
2478
2479 #ifdef MULE
2480                         start_over_with_extended:
2481                         if (has_extended_chars) {
2482                                 /* There are extended chars here, which means we
2483                                    need to start over and shift to unified
2484                                    range-table format. */
2485                                 if (buf_end[-2] == charset)
2486                                         buf_end[-2] = charset_mule;
2487                                 else
2488                                         buf_end[-2] = charset_mule_not;
2489                                 buf_end--;
2490                                 p = p1;
2491                                 /* go back to the beginning of the charset,
2492                                    after a possible ^. */
2493                                 rtab = Vthe_lisp_rangetab;
2494                                 Fclear_range_table(rtab);
2495
2496                                 /* charset_not matches newline according to a
2497                                    syntax bit.  */
2498                                 if ((re_opcode_t) buf_end[-1] ==
2499                                     charset_mule_not
2500                                     && (syntax &
2501                                         RE_HAT_LISTS_NOT_NEWLINE))
2502                                         SET_EITHER_BIT('\n');
2503                         }
2504 #endif                          /* MULE */
2505
2506                                 /* Read in characters and ranges, setting map bits.  */
2507                         for (;;) {
2508                                 if (p == pend) {
2509                                         free_stack();
2510                                         return REG_EBRACK;
2511                                 }
2512                                 PATFETCH(c);
2513
2514 #ifdef MULE
2515                                 if (c >= 0x80 && !has_extended_chars) {
2516                                         has_extended_chars = 1;
2517                                         /* Frumble-bumble, we've found some
2518                                            extended chars.  Need to start over,
2519                                            process everything using the general
2520                                            extended-char mechanism, and need to
2521                                            use charset_mule and charset_mule_not
2522                                            instead of charset and
2523                                            charset_not. */
2524                                         goto start_over_with_extended;
2525                                 }
2526 #endif                          /* MULE */
2527                                 /* \ might escape characters inside [...] and
2528                                    [^...].  */
2529                                 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) &&
2530                                     c == '\\') {
2531                                         if (p == pend) {
2532                                                 free_stack();
2533                                                 return REG_EESCAPE;
2534                                         }
2535
2536                                         PATFETCH(c1);
2537 #ifdef MULE
2538                                         if (c1 >= 0x80
2539                                             && !has_extended_chars) {
2540                                                 has_extended_chars = 1;
2541                                                 goto start_over_with_extended;
2542                                         }
2543 #endif                          /* MULE */
2544                                         SET_EITHER_BIT(c1);
2545                                         continue;
2546                                 }
2547
2548                                 /* Could be the end of the bracket
2549                                    expression.  If it's not (i.e., when
2550                                    the bracket expression is `[]' so
2551                                    far), the ']' character bit gets set
2552                                    way below.  */
2553                                 if (c == ']' && p != p1 + 1)
2554                                         break;
2555
2556                                 /* Look ahead to see if it's a range
2557                                    when the last thing was a character
2558                                    class.  */
2559                                 if (had_char_class && c == '-'
2560                                     && *p != ']') {
2561                                         free_stack();
2562                                         return REG_ERANGE;
2563                                 }
2564
2565                                 /* Look ahead to see if it's a range
2566                                    when the last thing was a character:
2567                                    if this is a hyphen not at the
2568                                    beginning or the end of a list, then
2569                                    it's the range operator.  */
2570                                 if (c == '-'
2571                                     && !(p - 2 >= pattern
2572                                          && p[-2] == '[')
2573                                     && !(p - 3 >= pattern
2574                                          && p[-3] == '['
2575                                          && p[-2] == '^')
2576                                     && *p != ']') {
2577                                         reg_errcode_t ret;
2578
2579 #ifdef MULE
2580                                         if (*(const unsigned char *)p >= 0x80
2581                                             && !has_extended_chars) {
2582                                                 has_extended_chars = 1;
2583                                                 goto start_over_with_extended;
2584                                         }
2585                                         if (has_extended_chars)
2586                                                 ret = compile_extended_range
2587                                                         (&p, pend, translate,
2588                                                          syntax, rtab);
2589                                         else
2590 #endif                          /* MULE */
2591                                                 ret = compile_range
2592                                                         (&p, pend, translate,
2593                                                          syntax, buf_end);
2594                                         if (ret != REG_NOERROR) {
2595                                                 free_stack();
2596                                                 return ret;
2597                                         }
2598                                 }
2599
2600                                 else if (p[0] == '-' && p[1] != ']') {
2601                                         /* This handles ranges made up of
2602                                            characters only.  */
2603                                         reg_errcode_t ret;
2604
2605                                         /* Move past the `-'.  */
2606                                         PATFETCH(c1);
2607
2608 #ifdef MULE
2609                                         if (*(const unsigned char*)p >= 0x80
2610                                             && !has_extended_chars) {
2611                                                 has_extended_chars = 1;
2612                                                 goto start_over_with_extended;
2613                                         }
2614                                         if (has_extended_chars)
2615                                                 ret = compile_extended_range(
2616                                                         &p, pend, translate,
2617                                                         syntax, rtab);
2618                                         else
2619 #endif                          /* MULE */
2620                                                 ret = compile_range(
2621                                                         &p, pend, translate,
2622                                                         syntax, buf_end);
2623                                         if (ret != REG_NOERROR) {
2624                                                 free_stack();
2625                                                 return ret;
2626                                         }
2627                                 }
2628
2629                                 /* See if we're at the beginning of a possible
2630                                    character class.  */
2631
2632                                 else if (syntax & RE_CHAR_CLASSES &&
2633                                          c == '[' && *p == ':') {
2634                                         /* Leave room for the null.  */
2635                                         char str[CHAR_CLASS_MAX_LENGTH + 1];
2636
2637                                         PATFETCH(c);
2638                                         c1 = 0;
2639
2640                                         /* If pattern is `[[:'.  */
2641                                         if (p == pend) {
2642                                                 free_stack();
2643                                                 return REG_EBRACK;
2644                                         }
2645                                         for (;;) {
2646                                                 /* #### This code is unused.
2647                                                    Correctness is not checked
2648                                                    after TRT table change.  */
2649                                                 PATFETCH(c);
2650                                                 if (c == ':' || c == ']'
2651                                                     || p == pend
2652                                                     || c1 ==
2653                                                     CHAR_CLASS_MAX_LENGTH)
2654                                                         break;
2655                                                 str[c1++] = (char)c;
2656                                         }
2657                                         str[c1] = '\0';
2658
2659                                         /* If isn't a word bracketed by
2660                                            `[:' and `:]': undo the
2661                                            ending character, the
2662                                            letters, and leave the
2663                                            leading `:' and `[' (but set
2664                                            bits for them).  */
2665                                         if (c == ':' && *p == ']') {
2666                                                 re_wctype_t wct =
2667                                                         re_wctype(
2668                                                                 (unsigned char*)
2669                                                                 str);
2670
2671                                                 if (wct == RECC_ERROR) {
2672                                                         free_stack();
2673                                                         return REG_ECTYPE;
2674                                                 }
2675
2676                                                 /* Throw away the ] at
2677                                                    the end of the
2678                                                    character class.  */
2679                                                 PATFETCH(c);
2680
2681                                                 if (p == pend) {
2682                                                         free_stack();
2683                                                         return REG_EBRACK;
2684                                                 }
2685
2686                                                 for (int ch = 0;
2687                                                      ch < 1<<BYTEWIDTH;
2688                                                      ch++) {
2689                                                         /* rmsmacs has
2690                                                            this */
2691 #if 0
2692                                                         int translated =
2693                                                                 TRANSLATE(ch);
2694                                                         if (translated <
2695                                                             (1 << BYTEWIDTH) &&
2696                                                             re_iswctype(ch, wct)) {
2697                                                                 SET_EITHER_BIT(ch);
2698                                                         }
2699 #else
2700                                                         if (re_iswctype(ch, wct)) {
2701                                                                 SET_EITHER_BIT(ch);
2702                                                         }
2703 #endif
2704                                                 }
2705                                                 had_char_class = true;
2706                                         } else {
2707                                                 c1++;
2708                                                 while (c1--) {
2709                                                         PATUNFETCH;
2710                                                 }
2711                                                 SET_EITHER_BIT('[');
2712                                                 SET_EITHER_BIT(':');
2713                                                 had_char_class = false;
2714                                         }
2715                                 } else {
2716                                         had_char_class = false;
2717                                         SET_EITHER_BIT(c);
2718                                 }
2719                         }
2720
2721 #ifdef MULE
2722                         if (has_extended_chars) {
2723                                 /* We have a range table, not a bit vector. */
2724                                 int bytes_needed =
2725                                         unified_range_table_bytes_needed
2726                                         (rtab);
2727                                 GET_BUFFER_SPACE(bytes_needed);
2728                                 unified_range_table_copy_data(rtab,
2729                                                               buf_end);
2730                                 buf_end +=
2731                                         unified_range_table_bytes_used
2732                                         (buf_end);
2733                                 break;
2734                         }
2735 #endif                          /* MULE */
2736                                 /* Discard any (non)matching list bytes that are
2737                                    all 0 at the end of the map.  Decrease the
2738                                    map-length byte too.  */
2739                         while ((int)buf_end[-1] > 0
2740                                && buf_end[buf_end[-1] - 1] == 0)
2741                                 buf_end[-1]--;
2742                         buf_end += buf_end[-1];
2743                 }
2744                         break;
2745
2746                 case '(':
2747                         if (syntax & RE_NO_BK_PARENS)
2748                                 goto handle_open;
2749                         else
2750                                 goto normal_char;
2751
2752                 case ')':
2753                         if (syntax & RE_NO_BK_PARENS)
2754                                 goto handle_close;
2755                         else
2756                                 goto normal_char;
2757
2758                 case '\n':
2759                         if (syntax & RE_NEWLINE_ALT)
2760                                 goto handle_alt;
2761                         else
2762                                 goto normal_char;
2763
2764                 case '|':
2765                         if (syntax & RE_NO_BK_VBAR)
2766                                 goto handle_alt;
2767                         else
2768                                 goto normal_char;
2769
2770                 case '{':
2771                         if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2772                                 goto handle_interval;
2773                         else
2774                                 goto normal_char;
2775
2776                 case '\\':
2777                         if (p == pend) {
2778                                 free_stack();
2779                                 return REG_EESCAPE;
2780                         }
2781
2782                         /* Do not translate the character after the \, so that we can
2783                            distinguish, e.g., \B from \b, even if we normally would
2784                            translate, e.g., B to b.  */
2785                         PATFETCH_RAW(c);
2786
2787                         switch (c) {
2788                         case '(':
2789                                 if (syntax & RE_NO_BK_PARENS)
2790                                         goto normal_backslash;
2791
2792                               handle_open:
2793                                 {
2794                                         regnum_t r;
2795                                         int shy = 0;
2796
2797                                         if (!(syntax & RE_NO_SHY_GROUPS)
2798                                             && p != pend && *p == '?') {
2799                                                 p++;
2800                                                 PATFETCH(c);
2801                                                 switch (c) {
2802                                                 case ':':
2803                                                         /* shy groups */
2804                                                         shy = 1;
2805                                                         break;
2806
2807                                                         /* All others are
2808                                                            reserved for future
2809                                                            constructs. */
2810                                                 default:
2811                                                         free_stack();
2812                                                         return REG_BADPAT;
2813                                                 }
2814                                         }
2815
2816                                         r = ++regnum;
2817                                         bufp->re_ngroups++;
2818                                         if (!shy) {
2819                                                 /* Record the translation from capturing group index to
2820                                                    register number, reallocating table as needed. */
2821                                                 bufp->re_nsub++;
2822                                                 while (bufp->
2823                                                        external_to_internal_register_size
2824                                                        <= bufp->re_nsub) {
2825                                                         int i;
2826                                                         int old_size =
2827                                                             bufp->
2828                                                             external_to_internal_register_size;
2829                                                         bufp->
2830                                                             external_to_internal_register_size
2831                                                             += 5;
2832                                                         RETALLOC(bufp->
2833                                                                  external_to_internal_register,
2834                                                                  bufp->
2835                                                                  external_to_internal_register_size,
2836                                                                  int);
2837                                                         /* debugging */
2838                                                         for (i = old_size;
2839                                                              i <
2840                                                              bufp->
2841                                                              external_to_internal_register_size;
2842                                                              i++)
2843                                                                 bufp->
2844                                                                     external_to_internal_register
2845                                                                     [i] =
2846                                                                     (int)
2847                                                                     0xDEADBEEF;
2848                                                 }
2849
2850                                                 bufp->
2851                                                     external_to_internal_register
2852                                                     [bufp->re_nsub] =
2853                                                     bufp->re_ngroups;
2854                                         }
2855
2856                                         if (COMPILE_STACK_FULL) {
2857                                                 RETALLOC(compile_stack.stack,
2858                                                          compile_stack.
2859                                                          size << 1,
2860                                                          compile_stack_elt_t);
2861                                                 if (compile_stack.stack == NULL)
2862                                                         return REG_ESPACE;
2863
2864                                                 compile_stack.size <<= 1;
2865                                         }
2866
2867                                         /* These are the values to restore when
2868                                            we hit end of this group.  They are
2869                                            all relative offsets, so that if the
2870                                            whole pattern moves because of
2871                                            realloc, they will still be
2872                                            valid.  */
2873                                         COMPILE_STACK_TOP.begalt_offset =
2874                                                 begalt - bufp->buffer;
2875                                         COMPILE_STACK_TOP.fixup_alt_jump =
2876                                                 fixup_alt_jump
2877                                                 ? fixup_alt_jump -
2878                                                 bufp->buffer + 1
2879                                                 : 0;
2880                                         COMPILE_STACK_TOP.laststart_offset =
2881                                                 buf_end - bufp->buffer;
2882                                         COMPILE_STACK_TOP.regnum = r;
2883
2884                                         /* We will eventually replace the 0 with
2885                                            the number of groups inner to this
2886                                            one.  But do not push a start_memory
2887                                            for groups beyond the last one we can
2888                                            represent in the compiled pattern.
2889                                            #### bad bad bad.  this will fail in
2890                                            lots of ways, if we ever have to
2891                                            backtrack for these groups.
2892                                          */
2893                                         if (r <= MAX_REGNUM) {
2894                                                 COMPILE_STACK_TOP.
2895                                                         inner_group_offset =
2896                                                         buf_end - bufp->buffer +
2897                                                         2;
2898                                                 BUF_PUSH_3(start_memory, r, 0);
2899                                         }
2900
2901                                         compile_stack.avail++;
2902
2903                                         fixup_alt_jump = 0;
2904                                         laststart = 0;
2905                                         begalt = buf_end;
2906                                         /* If we've reached MAX_REGNUM groups,
2907                                            then this open won't actually
2908                                            generate any code, so we'll have to
2909                                            clear pending_exact explicitly.  */
2910                                         pending_exact = 0;
2911                                 }
2912                                 break;
2913
2914                         case ')':
2915                                 if (syntax & RE_NO_BK_PARENS)
2916                                         goto normal_backslash;
2917
2918                                 if (COMPILE_STACK_EMPTY) {
2919                                         if (syntax &
2920                                             RE_UNMATCHED_RIGHT_PAREN_ORD) {
2921                                                 goto normal_backslash;
2922                                         } else {
2923                                                 free_stack();
2924                                                 return REG_ERPAREN;
2925                                         }
2926                                 }
2927
2928                               handle_close:
2929                                 if (fixup_alt_jump) {
2930                                         /* Push a dummy failure point at the end
2931                                            of the alternative for a possible
2932                                            future `pop_failure_jump' to pop.
2933                                            See comments at `push_dummy_failure'
2934                                            in `re_match_2'. */
2935                                         BUF_PUSH(push_dummy_failure);
2936
2937                                         /* We allocated space for this jump when
2938                                            we assigned to `fixup_alt_jump', in
2939                                            the `handle_alt' case below.  */
2940                                         STORE_JUMP(jump_past_alt,
2941                                                    fixup_alt_jump, buf_end - 1);
2942                                 }
2943
2944                                 /* See similar code for backslashed left paren
2945                                    above.  */
2946                                 if (COMPILE_STACK_EMPTY) {
2947                                         if (syntax &
2948                                             RE_UNMATCHED_RIGHT_PAREN_ORD) {
2949                                                 goto normal_char;
2950                                         } else {
2951                                                 free_stack();
2952                                                 return REG_EPAREN;
2953                                         }
2954                                 }
2955
2956                                 /* Since we just checked for an empty stack
2957                                    above, this ``can't happen''.  */
2958                                 assert(compile_stack.avail != 0);
2959                                 {
2960                                         /* We don't just want to restore into
2961                                            `regnum', because later groups should
2962                                            continue to be numbered higher, as in
2963                                            `(ab)c(de)' -- the second group is
2964                                            #2.  */
2965                                         regnum_t this_group_regnum;
2966
2967                                         compile_stack.avail--;
2968                                         begalt =
2969                                                 bufp->buffer +
2970                                                 COMPILE_STACK_TOP.begalt_offset;
2971                                         fixup_alt_jump =
2972                                                 COMPILE_STACK_TOP.
2973                                                 fixup_alt_jump
2974                                                 ? bufp->buffer +
2975                                                 COMPILE_STACK_TOP.
2976                                                 fixup_alt_jump - 1
2977                                                 : 0;
2978                                         laststart =
2979                                                 bufp->buffer +
2980                                                 COMPILE_STACK_TOP.
2981                                                 laststart_offset;
2982                                         this_group_regnum =
2983                                                 COMPILE_STACK_TOP.regnum;
2984                                         /* If we've reached MAX_REGNUM groups,
2985                                            then this open won't actually
2986                                            generate any code, so we'll have to
2987                                            clear pending_exact explicitly.  */
2988                                         pending_exact = 0;
2989
2990                                         /* We're at the end of the group, so now
2991                                            we know how many groups were inside
2992                                            this one.  */
2993                                         if (this_group_regnum <= MAX_REGNUM) {
2994                                                 unsigned char *inner_group_loc
2995                                                         =
2996                                                         bufp->buffer +
2997                                                         COMPILE_STACK_TOP.
2998                                                         inner_group_offset;
2999
3000                                                 *inner_group_loc =
3001                                                         regnum -
3002                                                         this_group_regnum;
3003                                                 BUF_PUSH_3(stop_memory,
3004                                                            this_group_regnum,
3005                                                            regnum -
3006                                                            this_group_regnum);
3007                                         }
3008                                 }
3009                                 break;
3010
3011                         case '|':       /* `\|'.  */
3012                                 if (syntax & RE_LIMITED_OPS
3013                                     || syntax & RE_NO_BK_VBAR) {
3014                                         goto normal_backslash;
3015                                 }
3016                         handle_alt:
3017                                 if (syntax & RE_LIMITED_OPS) {
3018                                         goto normal_char;
3019                                 }
3020
3021                                 /* Insert before the previous alternative a jump
3022                                    which jumps to this alternative if the former
3023                                    fails.  */
3024                                 GET_BUFFER_SPACE(3);
3025                                 INSERT_JUMP(on_failure_jump, begalt,
3026                                             buf_end + 6);
3027                                 pending_exact = 0;
3028                                 buf_end += 3;
3029
3030                                 /* The alternative before this one has a jump after it
3031                                    which gets executed if it gets matched.  Adjust that
3032                                    jump so it will jump to this alternative's analogous
3033                                    jump (put in below, which in turn will jump to the next
3034                                    (if any) alternative's such jump, etc.).  The last such
3035                                    jump jumps to the correct final destination.  A picture:
3036                                    _____ _____
3037                                    |   | |   |
3038                                    |   v |   v
3039                                    a | b   | c
3040
3041                                    If we are at `b', then fixup_alt_jump right now points to a
3042                                    three-byte space after `a'.  We'll put in the jump, set
3043                                    fixup_alt_jump to right after `b', and leave behind three
3044                                    bytes which we'll fill in when we get to after `c'.  */
3045
3046                                 if (fixup_alt_jump)
3047                                         STORE_JUMP(jump_past_alt,
3048                                                    fixup_alt_jump, buf_end);
3049
3050                                 /* Mark and leave space for a jump after this alternative,
3051                                    to be filled in later either by next alternative or
3052                                    when know we're at the end of a series of alternatives.  */
3053                                 fixup_alt_jump = buf_end;
3054                                 GET_BUFFER_SPACE(3);
3055                                 buf_end += 3;
3056
3057                                 laststart = 0;
3058                                 begalt = buf_end;
3059                                 break;
3060
3061                         case '{':
3062                                 /* If \{ is a literal.  */
3063                                 if (!(syntax & RE_INTERVALS)
3064                                     /* If we're at `\{' and it's not the open-interval
3065                                        operator.  */
3066                                     || ((syntax & RE_INTERVALS)
3067                                         && (syntax & RE_NO_BK_BRACES))
3068                                     || (p - 2 == pattern && p == pend))
3069                                         goto normal_backslash;
3070
3071                               handle_interval:
3072                                 {
3073                                         /* If got here, then the syntax allows
3074                                            intervals.  */
3075
3076                                         /* At least (most) this many matches
3077                                            must be made.  */
3078                                         int lower_bound = -1, upper_bound = -1;
3079
3080                                         beg_interval = p - 1;
3081
3082                                         if (p == pend) {
3083                                                 if (syntax & RE_NO_BK_BRACES) {
3084                                                         goto unfetch_interval;
3085                                                 } else {
3086                                                         free_stack();
3087                                                         return REG_EBRACE;
3088                                                 }
3089                                         }
3090
3091                                         GET_UNSIGNED_NUMBER(lower_bound);
3092
3093                                         if (c == ',') {
3094                                                 GET_UNSIGNED_NUMBER(
3095                                                         upper_bound);
3096                                                 if (upper_bound < 0) {
3097                                                         upper_bound =
3098                                                                 RE_DUP_MAX;
3099                                                 }
3100                                         } else {
3101                                                 /* Interval such as `{1}' =>
3102                                                    match exactly once. */
3103                                                 upper_bound = lower_bound;
3104                                         }
3105
3106                                         if (lower_bound < 0
3107                                             || upper_bound > RE_DUP_MAX
3108                                             || lower_bound > upper_bound) {
3109                                                 if (syntax & RE_NO_BK_BRACES) {
3110                                                         goto unfetch_interval;
3111                                                 } else {
3112                                                         free_stack();
3113                                                         return REG_BADBR;
3114                                                 }
3115                                         }
3116
3117                                         if (!(syntax & RE_NO_BK_BRACES)) {
3118                                                 if (c != '\\') {
3119                                                         free_stack();
3120                                                         return REG_EBRACE;
3121                                                 }
3122                                                 PATFETCH(c);
3123                                         }
3124
3125                                         if (c != '}') {
3126                                                 if (syntax & RE_NO_BK_BRACES) {
3127                                                         goto unfetch_interval;
3128                                                 } else {
3129                                                         free_stack();
3130                                                         return REG_BADBR;
3131                                                 }
3132                                         }
3133
3134                                         /* We just parsed a valid interval.  */
3135
3136                                         /* If it's invalid to have no preceding
3137                                            re.  */
3138                                         if (!laststart) {
3139                                                 if (syntax &
3140                                                     RE_CONTEXT_INVALID_OPS) {
3141                                                         free_stack();
3142                                                         return REG_BADRPT;
3143                                                 } else if (
3144                                                         syntax &
3145                                                         RE_CONTEXT_INDEP_OPS) {
3146                                                         laststart = buf_end;
3147                                                 } else {
3148                                                         goto unfetch_interval;
3149                                                 }
3150                                         }
3151
3152                                         /* If the upper bound is zero, don't
3153                                            want to succeed at all; jump from
3154                                            `laststart' to `b + 3', which will be
3155                                            the end of the buffer after we insert
3156                                            the jump.  */
3157                                         if (upper_bound == 0) {
3158                                                 GET_BUFFER_SPACE(3);
3159                                                 INSERT_JUMP(jump, laststart,
3160                                                             buf_end + 3);
3161                                                 buf_end += 3;
3162                                         }
3163
3164                                         /* Otherwise, we have a nontrivial interval.  When
3165                                            we're all done, the pattern will look like:
3166                                            set_number_at <jump count> <upper bound>
3167                                            set_number_at <succeed_n count> <lower bound>
3168                                            succeed_n <after jump addr> <succeed_n count>
3169                                            <body of loop>
3170                                            jump_n <succeed_n addr> <jump count>
3171                                            (The upper bound and `jump_n' are omitted if
3172                                            `upper_bound' is 1, though.)  */
3173                                         else {  /* If the upper bound is > 1, we need to insert
3174                                                    more at the end of the loop.  */
3175                                                 Memory_count nbytes =
3176                         &n