1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
10 This file is part of SXEmacs
12 SXEmacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 SXEmacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
86 /* The `emacs' switch turns on certain matching commands
87 that make sense only in Emacs. */
96 Lisp_Object Vthe_lisp_rangetab;
98 void complex_vars_of_regex(void)
100 Vthe_lisp_rangetab = Fmake_range_table();
101 staticpro(&Vthe_lisp_rangetab);
106 void complex_vars_of_regex(void)
112 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
113 #define TRANSLATE_P(tr) (!NILP (tr) && CHAR_TABLEP(tr))
115 /* just massage that xfree thing */
117 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
125 /* If we are not linking with Emacs proper,
126 we can't use the relocating allocator
127 even if config.h says that we can. */
130 #if defined (STDC_HEADERS) || defined (_LIBC)
132 # include <stdbool.h>
138 /* Types normally included via lisp.h */
139 #include <stddef.h> /* for ptrdiff_t */
142 #ifndef DECLARE_NOTHING
143 #define DECLARE_NOTHING struct nosuchstruct
149 #define charptr_emchar(str) ((Emchar) (str)[0])
151 #define INC_CHARPTR(p) ((p)++)
152 #define DEC_CHARPTR(p) ((p)--)
156 /* Define the syntax stuff for \<, \>, etc. */
158 /* This must be nonzero for the wordchar and notwordchar pattern
159 commands in re_match_2. */
166 extern char *re_syntax_table;
168 #else /* not SYNTAX_TABLE */
170 /* How many characters in the character set. */
171 #define CHAR_SET_SIZE 256
173 static char re_syntax_table[CHAR_SET_SIZE];
175 static void init_syntax_once(void)
180 const char *word_syntax_chars =
181 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
183 memset(re_syntax_table, 0, sizeof(re_syntax_table));
185 while (*word_syntax_chars)
186 re_syntax_table[(unsigned int)(*word_syntax_chars++)] =
193 #endif /* SYNTAX_TABLE */
195 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
196 #undef SYNTAX_FROM_CACHE
197 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
199 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
200 #define TRANSLATE_P(tr) tr
202 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
203 # if defined HAVE_GC_GC_H
205 # elif defined HAVE_GC_H
208 # define xmalloc GC_MALLOC
209 # define xmalloc_atomic GC_MALLOC_ATOMIC
210 # define xrealloc GC_REALLOC
211 # define xfree GC_FREE
212 # define xstrdup GC_STRDUP
213 #else /* !HAVE_BDWGC */
214 # define xmalloc malloc
215 # define xmalloc_atomic malloc
216 # define xrealloc realloc
218 # define xstrdup strdup
219 #endif /* HAVE_BDWGC */
224 /* Under XEmacs, this is needed because we don't define it elsewhere. */
225 #ifdef SWITCH_ENUM_BUG
226 #define SWITCH_ENUM_CAST(x) ((int)(x))
228 #define SWITCH_ENUM_CAST(x) (x)
231 /* Get the interface, including the syntax bits. */
234 /* isalpha etc. are used for the character classes. */
237 /* Jim Meyering writes:
239 "... Some ctype macros are valid only for character codes that
240 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
241 using /bin/cc or gcc but without giving an ansi option). So, all
242 ctype uses should be through macros like ISPRINT... If
243 STDC_HEADERS is defined, then autoconf has verified that the ctype
244 macros don't need to be guarded with references to isascii. ...
245 Defining isascii to 1 should let any compiler worth its salt
246 eliminate the && through constant folding." */
248 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
249 #define ISASCII_1(c) 1
251 #define ISASCII_1(c) isascii(c)
255 /* The IS*() macros can be passed any character, including an extended
256 one. We need to make sure there are no crashes, which would occur
257 otherwise due to out-of-bounds array references. */
258 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
260 #define ISASCII(c) ISASCII_1 (c)
264 #define ISBLANK(c) (ISASCII (c) && isblank (c))
266 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
269 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
271 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
274 /* stolen from FSF/src/charset.h */
275 #define SINGLE_BYTE_CHAR_P(c) (((unsigned int)(c) & 0xFF) == (c))
276 /* 1 if C is an ASCII character. */
277 #define IS_REAL_ASCII(c) ((c) < 128)
278 /* 1 if C is a unibyte character. */
279 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P(c))
281 #define ISPRINT(c) (ISASCII (c) && isprint (c))
282 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
283 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
284 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
285 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
286 #define ISLOWER(c) (ISASCII (c) && islower (c))
287 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
288 #define ISSPACE(c) (ISASCII (c) && isspace (c))
289 #define ISUPPER(c) (ISASCII (c) && isupper (c))
290 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
295 __attribute__((always_inline));
299 Lisp_Object ct = regex_emacs_buffer->mirror_syntax_table;
301 return (SYNTAX(XCHAR_TABLE(ct), c) == Sword);
304 # define ISWORD(c) (ISALPHA(c))
308 #define NULL (void *)0
311 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
312 since ours (we hope) works properly with all combinations of
313 machines, compilers, `char' and `unsigned char' argument types.
314 (Per Bothner suggested the basic approach.) */
315 #undef SIGN_EXTEND_CHAR
317 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
318 #else /* not __STDC__ */
319 /* As in Harbison and Steele. */
320 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
323 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
324 use `alloca' instead of `malloc'. This is because using malloc in
325 re_search* or re_match* could cause memory leaks when C-g is used in
326 Emacs; also, malloc is slower and causes storage fragmentation. On
327 the other hand, malloc is more portable, and easier to debug.
329 Because we sometimes use alloca, some routines have to be macros,
330 not functions -- `alloca'-allocated space disappears at the end of the
331 function it is called in. */
335 #define REGEX_ALLOCATE xmalloc
336 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc(source, nsize)
337 #define REGEX_FREE xfree
339 #else /* !REGEX_MALLOC */
341 /* Emacs already defines alloca, sometimes. */
344 /* Make alloca work the best possible way. */
346 #define alloca __builtin_alloca
347 #else /* not __GNUC__ */
350 #else /* not __GNUC__ or HAVE_ALLOCA_H */
351 #ifndef _AIX /* Already did AIX, up at the top. */
353 #endif /* not _AIX */
354 #endif /* HAVE_ALLOCA_H */
355 #endif /* __GNUC__ */
357 #endif /* not alloca */
359 #define REGEX_ALLOCATE alloca
361 /* Assumes a `char *destination' variable. */
362 #define REGEX_REALLOCATE(source, osize, nsize) \
363 (destination = (char *) alloca (nsize), \
364 memmove (destination, source, osize), \
367 /* No need to do anything to free, after alloca. */
368 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
370 #endif /* REGEX_MALLOC */
372 /* Define how to allocate the failure stack. */
376 #define REGEX_ALLOCATE_STACK(size) \
377 r_alloc ((char **) &failure_stack_ptr, (size))
378 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
379 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
380 #define REGEX_FREE_STACK(ptr) \
381 r_alloc_free ((void **) &failure_stack_ptr)
383 #else /* !REL_ALLOC */
387 #define REGEX_ALLOCATE_STACK xmalloc
388 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc(source, nsize)
389 #define REGEX_FREE_STACK xfree
391 #else /* !REGEX_MALLOC */
393 #define REGEX_ALLOCATE_STACK alloca
395 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
396 REGEX_REALLOCATE (source, osize, nsize)
397 /* No need to explicitly free anything. */
398 #define REGEX_FREE_STACK(arg)
400 #endif /* REGEX_MALLOC */
401 #endif /* REL_ALLOC */
403 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
404 `string1' or just past its end. This works if PTR is NULL, which is
406 #define FIRST_STRING_P(ptr) \
407 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
409 /* (Re)Allocate N items of type T using malloc, or fail. */
410 #define TALLOC(n, t) \
411 ((t *)xmalloc((n) * sizeof (t)))
412 #define TALLOC_ATOMIC(n, t) \
413 ((t *)xmalloc_atomic((n) * sizeof (t)))
414 #define RETALLOC(addr, n, t) \
415 ((addr) = (t *)xrealloc(addr, (n) * sizeof (t)))
416 #define RETALLOC_IF(addr, n, t) \
418 RETALLOC((addr), (n), t); \
420 (addr) = TALLOC ((n), t)
421 #define REGEX_TALLOC(n, t) \
422 ((t *)REGEX_ALLOCATE((n) * sizeof (t)))
424 #define BYTEWIDTH 8 /* In bits. */
426 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
430 #define MAX(a, b) ((a) > (b) ? (a) : (b))
431 #define MIN(a, b) ((a) < (b) ? (a) : (b))
433 /* Type of source-pattern and string chars. */
434 typedef const unsigned char re_char;
436 typedef char re_bool;
440 /* These are the command codes that appear in compiled regular
441 expressions. Some opcodes are followed by argument bytes. A
442 command code can specify any interpretation whatsoever for its
443 arguments. Zero bytes may appear in the compiled regular expression. */
448 /* Succeed right away--no more backtracking. */
451 /* Followed by one byte giving n, then by n literal bytes. */
454 /* Matches any (more or less) character. */
457 /* Matches any one char belonging to specified set. First
458 following byte is number of bitmap bytes. Then come bytes
459 for a bitmap saying which chars are in. Bits in each byte
460 are ordered low-bit-first. A character is in the set if its
461 bit is 1. A character too large to have a bit in the map is
462 automatically not in the set. */
465 /* Same parameters as charset, but match any character that is
466 not one of those specified. */
469 /* Start remembering the text that is matched, for storing in a
470 register. Followed by one byte with the register number, in
471 the range 1 to the pattern buffer's re_ngroups
472 field. Then followed by one byte with the number of groups
473 inner to this one. (This last has to be part of the
474 start_memory only because we need it in the on_failure_jump
478 /* Stop remembering the text that is matched and store it in a
479 memory register. Followed by one byte with the register
480 number, in the range 1 to `re_ngroups' in the
481 pattern buffer, and one byte with the number of inner groups,
482 just like `start_memory'. (We need the number of inner
483 groups here because we don't have any easy way of finding the
484 corresponding start_memory when we're at a stop_memory.) */
487 /* Match a duplicate of something remembered. Followed by one
488 byte containing the register number. */
491 /* Fail unless at beginning of line. */
494 /* Fail unless at end of line. */
497 /* Succeeds if at beginning of buffer (if emacs) or at beginning
498 of string to be matched (if not). */
501 /* Analogously, for end of buffer/string. */
504 /* Followed by two byte relative address to which to jump. */
507 /* Same as jump, but marks the end of an alternative. */
510 /* Followed by two-byte relative address of place to resume at
511 in case of failure. */
514 /* Like on_failure_jump, but pushes a placeholder instead of the
515 current string position when executed. */
516 on_failure_keep_string_jump,
518 /* Throw away latest failure point and then jump to following
519 two-byte relative address. */
522 /* Change to pop_failure_jump if know won't have to backtrack to
523 match; otherwise change to jump. This is used to jump
524 back to the beginning of a repeat. If what follows this jump
525 clearly won't match what the repeat does, such that we can be
526 sure that there is no use backtracking out of repetitions
527 already matched, then we change it to a pop_failure_jump.
528 Followed by two-byte address. */
531 /* Jump to following two-byte address, and push a dummy failure
532 point. This failure point will be thrown away if an attempt
533 is made to use it for a failure. A `+' construct makes this
534 before the first repeat. Also used as an intermediary kind
535 of jump when compiling an alternative. */
538 /* Push a dummy failure point and continue. Used at the end of
542 /* Followed by two-byte relative address and two-byte number n.
543 After matching N times, jump to the address upon failure. */
546 /* Followed by two-byte relative address, and two-byte number n.
547 Jump to the address N times, then fail. */
550 /* Set the following two-byte relative address to the
551 subsequent two-byte number. The address *includes* the two
555 wordchar, /* Matches any word-constituent character. */
556 notwordchar, /* Matches any char that is not a word-constituent. */
558 wordbeg, /* Succeeds if at word beginning. */
559 wordend, /* Succeeds if at word end. */
561 wordbound, /* Succeeds if at a word boundary. */
562 notwordbound /* Succeeds if not at a word boundary. */
564 , before_dot, /* Succeeds if before point. */
565 at_dot, /* Succeeds if at point. */
566 after_dot, /* Succeeds if after point. */
568 /* Matches any character whose syntax is specified. Followed by
569 a byte which contains a syntax code, e.g., Sword. */
572 /* Matches any character whose syntax is not that specified. */
576 /* need extra stuff to be able to properly work with XEmacs/Mule
577 characters (which may take up more than one byte) */
578 , charset_mule, /* Matches any character belonging to specified set.
579 The set is stored in "unified range-table
580 format"; see rangetab.c. Unlike the `charset'
581 opcode, this can handle arbitrary characters. */
583 charset_mule_not /* Same parameters as charset_mule, but match any
584 character that is not one of those specified. */
585 /* 97/2/17 jhod: The following two were merged back in from the Mule
586 2.3 code to enable some language specific processing */
587 , categoryspec, /* Matches entries in the character category tables */
588 notcategoryspec /* The opposite of the above */
592 /* Common operations on the compiled pattern. */
594 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
596 #define STORE_NUMBER(destination, number) \
598 (destination)[0] = (number) & 0377; \
599 (destination)[1] = (number) >> 8; \
602 /* Same as STORE_NUMBER, except increment DESTINATION to
603 the byte after where the number is stored. Therefore, DESTINATION
604 must be an lvalue. */
606 #define STORE_NUMBER_AND_INCR(destination, number) \
608 STORE_NUMBER (destination, number); \
609 (destination) += 2; \
612 /* Put into DESTINATION a number stored in two contiguous bytes starting
615 #define EXTRACT_NUMBER(destination, source) \
617 (destination) = *(source) & 0377; \
618 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
621 #ifdef REGEX_DEBUG_FLAG
622 static void extract_number(int *dest, re_char * source)
624 int temp = SIGN_EXTEND_CHAR(*(source + 1));
625 *dest = *source & 0377;
629 #ifndef EXTRACT_MACROS /* To debug the macros. */
630 #undef EXTRACT_NUMBER
631 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
632 #endif /* not EXTRACT_MACROS */
634 #endif /* REGEX_DEBUG_FLAG */
636 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
637 SOURCE must be an lvalue. */
639 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
641 EXTRACT_NUMBER (destination, source); \
645 #ifdef REGEX_DEBUG_FLAG
647 extract_number_and_incr(int *destination, unsigned char **source)
649 extract_number(destination, *source);
653 #ifndef EXTRACT_MACROS
654 #undef EXTRACT_NUMBER_AND_INCR
655 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
656 extract_number_and_incr (&dest, &src)
657 #endif /* not EXTRACT_MACROS */
659 #endif /* REGEX_DEBUG_FLAG */
661 /* If REGEX_DEBUG_FLAG is defined, Regex prints many voluminous messages about
662 what it is doing (if the variable `debug' is nonzero). If linked with the
663 main program in `iregex.c', you can enter patterns and strings interactively.
664 And if linked with the main program in `main.c' and the other test files, you
665 can run the already-written tests. */
667 #if defined (REGEX_DEBUG_FLAG)
669 /* We use standard I/O for debugging. */
673 /* XEmacs provides its own version of assert() */
674 /* It is useful to test things that ``must'' be true when debugging. */
679 /* no point in having this one, since we do not offer any accessor to this */
680 static int debug = 0;
683 #define DEBUG_STATEMENT(e) e
684 #define DEBUG_PRINT1(x) printf(x)
685 #define DEBUG_PRINT2(x1, x2) printf(x1, x2)
686 #define DEBUG_PRINT3(x1, x2, x3) printf(x1, x2, x3)
687 #define DEBUG_PRINT4(x1, x2, x3, x4) printf(x1, x2, x3, x4)
688 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
689 print_partial_compiled_pattern(s, e)
690 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
691 print_double_string (w, s1, sz1, s2, sz2)
693 /* Print the fastmap in human-readable form. */
695 static void print_fastmap(char *fastmap)
697 unsigned was_a_range = 0;
700 while (i < (1 << BYTEWIDTH)) {
704 while (i < (1 << BYTEWIDTH) && fastmap[i]) {
717 /* Print a compiled pattern string in human-readable form, starting at
718 the START pointer into it and ending just before the pointer END. */
720 /* can't help it ... the ugly mule code changes `start' upon alignment,
721 * so this can't have the const qualifier ... bugger */
723 print_partial_compiled_pattern(unsigned char *start, re_char *end)
726 unsigned char *p = start;
734 /* Loop over pattern commands. */
736 printf("%ld:\t", (long)(p - start));
738 switch ((const re_opcode_t)*p++) {
745 printf("/exactn/%d", mcnt);
755 printf("/start_memory/%d/%d", mcnt, *p++);
760 printf("/stop_memory/%d/%d", mcnt, *p++);
764 printf("/duplicate/%d", *p++);
773 REGISTER int c, last = -100;
774 REGISTER int in_range = 0;
776 printf("/charset [%s",
777 (re_opcode_t) * (p - 1) == charset_not
780 assert(p + *p < pend);
782 for (c = 0; c < 256; c++)
783 if (((unsigned char)(c / 8) < *p)
786 /* Are we starting a range? */
787 if (last + 1 == c && !in_range) {
791 /* Have we broken a range? */
792 else if (last + 1 != c
815 case charset_mule_not: {
818 printf("/charset_mule [%s",
819 (re_opcode_t) * (p - 1) == charset_mule_not
821 /* u_r_t_nentries() CHANGES its arg, why on earth
822 * is this marked const here then? :O -hrop */
823 nentries = unified_range_table_nentries(p);
824 for (i = 0; i < nentries; i++) {
825 EMACS_INT first, last;
826 Lisp_Object dummy_val;
828 /* u_r_t_get_range CHANGES its arg ...
829 * p can't be const thence */
830 unified_range_table_get_range(p, i,
837 printf("(0x%lx)", (long)first);
848 p += unified_range_table_bytes_used(p);
861 case on_failure_jump:
862 extract_number_and_incr(&mcnt, &p);
863 printf("/on_failure_jump to %ld",
864 (long)(p + mcnt - start));
867 case on_failure_keep_string_jump:
868 extract_number_and_incr(&mcnt, &p);
869 printf("/on_failure_keep_string_jump to %ld",
870 (long)(p + mcnt - start));
873 case dummy_failure_jump:
874 extract_number_and_incr(&mcnt, &p);
875 printf("/dummy_failure_jump to %ld",
876 (long)(p + mcnt - start));
879 case push_dummy_failure:
880 printf("/push_dummy_failure");
884 extract_number_and_incr(&mcnt, &p);
885 printf("/maybe_pop_jump to %ld",
886 (long)(p + mcnt - start));
889 case pop_failure_jump:
890 extract_number_and_incr(&mcnt, &p);
891 printf("/pop_failure_jump to %ld",
892 (long)(p + mcnt - start));
896 extract_number_and_incr(&mcnt, &p);
897 printf("/jump_past_alt to %ld",
898 (long)(p + mcnt - start));
902 extract_number_and_incr(&mcnt, &p);
903 printf("/jump to %ld", (long)(p + mcnt - start));
907 extract_number_and_incr(&mcnt, &p);
908 extract_number_and_incr(&mcnt2, &p);
909 printf("/succeed_n to %ld, %d times",
910 (long)(p + mcnt - start), mcnt2);
914 extract_number_and_incr(&mcnt, &p);
915 extract_number_and_incr(&mcnt2, &p);
916 printf("/jump_n to %ld, %d times",
917 (long)(p + mcnt - start), mcnt2);
921 extract_number_and_incr(&mcnt, &p);
922 extract_number_and_incr(&mcnt2, &p);
923 printf("/set_number_at location %ld to %d",
924 (long)(p + mcnt - start), mcnt2);
928 printf("/wordbound");
932 printf("/notwordbound");
944 printf("/before_dot");
952 printf("/after_dot");
956 printf("/syntaxspec");
962 printf("/notsyntaxspec");
968 /* 97/2/17 jhod Mule category patch */
970 printf("/categoryspec");
975 case notcategoryspec:
976 printf("/notcategoryspec");
980 /* end of category patch */
989 printf("/notwordchar");
1002 printf("?%d", *(p - 1));
1008 printf("%ld:\tend of pattern.\n", (long)(p - start));
1012 print_compiled_pattern(struct re_pattern_buffer *bufp)
1016 re_char *buffer = bufp->buffer;
1018 unsigned char *buffer = bufp->buffer;
1021 print_partial_compiled_pattern(buffer, buffer + bufp->used);
1022 printf("%ld bytes used/%ld bytes allocated.\n", bufp->used,
1025 if (bufp->fastmap_accurate && bufp->fastmap) {
1026 printf("fastmap: ");
1027 print_fastmap(bufp->fastmap);
1030 printf("re_nsub: %ld\t", (long)bufp->re_nsub);
1031 printf("re_ngroups: %ld\t", (long)bufp->re_ngroups);
1032 printf("regs_alloc: %d\t", bufp->regs_allocated);
1033 printf("can_be_null: %d\t", bufp->can_be_null);
1034 printf("newline_anchor: %d\n", bufp->newline_anchor);
1035 printf("no_sub: %d\t", bufp->no_sub);
1036 printf("not_bol: %d\t", bufp->not_bol);
1037 printf("not_eol: %d\t", bufp->not_eol);
1038 printf("syntax: %d\n", bufp->syntax);
1039 /* Perhaps we should print the translate table? */
1040 /* and maybe the category table? */
1042 if (bufp->external_to_internal_register) {
1045 printf("external_to_internal_register:\n");
1046 for (i = 0; i <= bufp->re_nsub; i++) {
1049 printf("%d -> %d", i,
1050 bufp->external_to_internal_register[i]);
1057 print_double_string(re_char * where, re_char * string1, int size1,
1058 re_char * string2, int size2)
1063 Element_count this_char;
1065 if (FIRST_STRING_P(where)) {
1066 for (this_char = where - string1; this_char < size1;
1068 putchar(string1[this_char]);
1073 for (this_char = where - string2; this_char < size2;
1075 putchar(string2[this_char]);
1079 #else /* not REGEX_DEBUG_FLAG */
1084 #define DEBUG_STATEMENT(e)
1085 #define DEBUG_PRINT1(x)
1086 #define DEBUG_PRINT2(x1, x2)
1087 #define DEBUG_PRINT3(x1, x2, x3)
1088 #define DEBUG_PRINT4(x1, x2, x3, x4)
1089 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1090 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1092 #endif /* REGEX_DEBUG_FLAG */
1094 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1095 also be assigned to arbitrarily: each pattern buffer stores its own
1096 syntax, so it can be changed between regex compilations. */
1097 /* This has no initializer because initialized variables in Emacs
1098 become read-only after dumping. */
1099 reg_syntax_t re_syntax_options;
1101 /* Specify the precise syntax of regexps for compilation. This provides
1102 for compatibility for various utilities which historically have
1103 different, incompatible syntaxes.
1105 The argument SYNTAX is a bit mask comprised of the various bits
1106 defined in regex.h. We return the old syntax. */
1108 reg_syntax_t re_set_syntax(reg_syntax_t syntax)
1110 reg_syntax_t ret = re_syntax_options;
1112 re_syntax_options = syntax;
1116 /* This table gives an error message for each of the error codes listed
1117 in regex.h. Obviously the order here has to be same as there.
1118 POSIX doesn't require that we do anything for REG_NOERROR,
1119 but why not be nice? */
1121 static const char *re_error_msgid[] = {
1122 "Success", /* REG_NOERROR */
1123 "No match", /* REG_NOMATCH */
1124 "Invalid regular expression", /* REG_BADPAT */
1125 "Invalid collation character", /* REG_ECOLLATE */
1126 "Invalid character class name", /* REG_ECTYPE */
1127 "Trailing backslash", /* REG_EESCAPE */
1128 "Invalid back reference", /* REG_ESUBREG */
1129 "Unmatched [ or [^", /* REG_EBRACK */
1130 "Unmatched ( or \\(", /* REG_EPAREN */
1131 "Unmatched \\{", /* REG_EBRACE */
1132 "Invalid content of \\{\\}", /* REG_BADBR */
1133 "Invalid range end", /* REG_ERANGE */
1134 "Memory exhausted", /* REG_ESPACE */
1135 "Invalid preceding regular expression", /* REG_BADRPT */
1136 "Premature end of regular expression", /* REG_EEND */
1137 "Regular expression too big", /* REG_ESIZE */
1138 "Unmatched ) or \\)", /* REG_ERPAREN */
1140 "Invalid syntax designator", /* REG_ESYNTAX */
1143 "Ranges may not span charsets", /* REG_ERANGESPAN */
1144 "Invalid category designator", /* REG_ECATEGORY */
1148 /* Avoiding alloca during matching, to placate r_alloc. */
1150 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1151 searching and matching functions should not call alloca. On some
1152 systems, alloca is implemented in terms of malloc, and if we're
1153 using the relocating allocator routines, then malloc could cause a
1154 relocation, which might (if the strings being searched are in the
1155 ralloc heap) shift the data out from underneath the regexp
1158 Here's another reason to avoid allocation: Emacs
1159 processes input from X in a signal handler; processing X input may
1160 call malloc; if input arrives while a matching routine is calling
1161 malloc, then we're scrod. But Emacs can't just block input while
1162 calling matching routines; then we don't notice interrupts when
1163 they come in. So, Emacs blocks input around all regexp calls
1164 except the matching calls, which it leaves unprotected, in the
1165 faith that they will not malloc. */
1167 /* Normally, this is fine. */
1168 #define MATCH_MAY_ALLOCATE
1170 /* When using GNU C, we are not REALLY using the C alloca, no matter
1171 what config.h may say. So don't take precautions for it. */
1176 /* The match routines may not allocate if (1) they would do it with malloc
1177 and (2) it's not safe for them to use malloc.
1178 Note that if REL_ALLOC is defined, matching would not use malloc for the
1179 failure stack, but we would still use it for the register vectors;
1180 so REL_ALLOC should not affect this. */
1181 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1182 #undef MATCH_MAY_ALLOCATE
1185 /* Failure stack declarations and macros; both re_compile_fastmap and
1186 re_match_2 use a failure stack. These have to be macros because of
1187 REGEX_ALLOCATE_STACK. */
1189 /* Number of failure points for which to initially allocate space
1190 when matching. If this number is exceeded, we allocate more
1191 space, so it is not a hard limit. */
1192 #ifndef INIT_FAILURE_ALLOC
1193 #define INIT_FAILURE_ALLOC 20
1196 /* Roughly the maximum number of failure points on the stack. Would be
1197 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1198 This is a variable only so users of regex can assign to it; we never
1199 change it ourselves. */
1200 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1201 /* 4400 was enough to cause a crash on Alpha OSF/1,
1202 whose default stack limit is 2mb. */
1203 int re_max_failures = 50000;
1205 int re_max_failures = 2000;
1208 union fail_stack_elt {
1209 const unsigned char *pointer;
1213 typedef union fail_stack_elt fail_stack_elt_t;
1216 fail_stack_elt_t *stack;
1218 Element_count avail; /* Offset of next open position. */
1221 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1222 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1223 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1225 /* Define macros to initialize and free the failure stack.
1226 Do `return -2' if the alloc fails. */
1228 #ifdef MATCH_MAY_ALLOCATE
1229 #define INIT_FAIL_STACK() \
1231 fail_stack.stack = (fail_stack_elt_t *) \
1232 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \
1233 sizeof (fail_stack_elt_t)); \
1235 if (fail_stack.stack == NULL) \
1238 fail_stack.size = INIT_FAILURE_ALLOC; \
1239 fail_stack.avail = 0; \
1242 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1244 #define INIT_FAIL_STACK() \
1246 fail_stack.avail = 0; \
1249 #define RESET_FAIL_STACK()
1252 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1254 Return 1 if succeeds, and 0 if either ran out of memory
1255 allocating space for it or it was already too large.
1257 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1259 #define DOUBLE_FAIL_STACK(fail_stack) \
1260 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1262 : ((fail_stack).stack = (fail_stack_elt_t *) \
1263 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1264 (fail_stack).size * sizeof (fail_stack_elt_t), \
1265 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1267 (fail_stack).stack == NULL \
1269 : ((fail_stack).size <<= 1, \
1272 /* Push pointer POINTER on FAIL_STACK.
1273 Return 1 if was able to do so and 0 if ran out of memory allocating
1275 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1276 ((FAIL_STACK_FULL () \
1277 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1279 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1282 /* Push a pointer value onto the failure stack.
1283 Assumes the variable `fail_stack'. Probably should only
1284 be called from within `PUSH_FAILURE_POINT'. */
1285 #define PUSH_FAILURE_POINTER(item) \
1286 fail_stack.stack[fail_stack.avail++].pointer = \
1287 (const unsigned char*)(item)
1289 /* This pushes an integer-valued item onto the failure stack.
1290 Assumes the variable `fail_stack'. Probably should only
1291 be called from within `PUSH_FAILURE_POINT'. */
1292 #define PUSH_FAILURE_INT(item) \
1293 fail_stack.stack[fail_stack.avail++].integer = (item)
1295 /* Push a fail_stack_elt_t value onto the failure stack.
1296 Assumes the variable `fail_stack'. Probably should only
1297 be called from within `PUSH_FAILURE_POINT'. */
1298 #define PUSH_FAILURE_ELT(item) \
1299 fail_stack.stack[fail_stack.avail++] = (item)
1301 /* These three POP... operations complement the three PUSH... operations.
1302 All assume that `fail_stack' is nonempty. */
1303 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1304 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1305 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1307 /* Used to omit pushing failure point id's when we're not debugging. */
1308 #ifdef REGEX_DEBUG_FLAG
1309 #define DEBUG_PUSH PUSH_FAILURE_INT
1310 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1312 #define DEBUG_PUSH(item)
1313 #define DEBUG_POP(item_addr)
1316 /* Push the information about the state we will need
1317 if we ever fail back to it.
1319 Requires variables fail_stack, regstart, regend, reg_info, and
1320 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1323 Does `return FAILURE_CODE' if runs out of memory. */
1325 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1326 #define DECLARE_DESTINATION char *destination
1328 #define DECLARE_DESTINATION DECLARE_NOTHING
1331 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1333 DECLARE_DESTINATION; \
1334 /* Must be int, so when we don't save any registers, the\
1335 arithmetic \ of 0 + -1 isn't done as unsigned. */ \
1338 DEBUG_STATEMENT (failure_id++); \
1339 DEBUG_STATEMENT (nfailure_points_pushed++); \
1340 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1341 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1342 (long unsigned int)(fail_stack).avail); \
1343 DEBUG_PRINT2 (" size: %lu\n", \
1344 (long unsigned int)(fail_stack).size); \
1346 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1347 DEBUG_PRINT2 (" available: %ld\n", \
1348 (long) REMAINING_AVAIL_SLOTS); \
1350 /* Ensure we have enough space allocated for what \
1351 * we will push. */ \
1352 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) { \
1353 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1354 return failure_code; \
1356 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1357 (unsigned long) (fail_stack).size); \
1358 DEBUG_PRINT2 (" slots available: %ld\n", \
1359 (long) REMAINING_AVAIL_SLOTS); \
1362 /* Push the info, starting with the registers. */ \
1363 DEBUG_PRINT1 ("\n"); \
1365 for (this_reg = lowest_active_reg; \
1366 this_reg <= highest_active_reg; \
1368 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1369 DEBUG_STATEMENT (num_regs_pushed++); \
1371 DEBUG_PRINT2(" start: %p\n", \
1372 regstart[this_reg]); \
1373 PUSH_FAILURE_POINTER(regstart[this_reg]); \
1375 DEBUG_PRINT2 (" end: %p\n", \
1376 regend[this_reg]); \
1377 PUSH_FAILURE_POINTER(regend[this_reg]); \
1379 DEBUG_PRINT2 (" info: 0x%p\n ", \
1380 ®_info[this_reg]); \
1381 DEBUG_PRINT2 (" match_null=%d", \
1382 REG_MATCH_NULL_STRING_P \
1383 (reg_info[this_reg])); \
1384 DEBUG_PRINT2 (" active=%d", \
1385 IS_ACTIVE (reg_info[this_reg])); \
1386 DEBUG_PRINT2 (" matched_something=%d", \
1388 (reg_info[this_reg])); \
1389 DEBUG_PRINT2 (" ever_matched_something=%d", \
1390 EVER_MATCHED_SOMETHING \
1391 (reg_info[this_reg])); \
1392 DEBUG_PRINT1 ("\n"); \
1393 PUSH_FAILURE_ELT(reg_info[this_reg].word); \
1396 DEBUG_PRINT2 (" Pushing low active reg: %d\n", \
1397 lowest_active_reg); \
1398 PUSH_FAILURE_INT (lowest_active_reg); \
1400 DEBUG_PRINT2 (" Pushing high active reg: %d\n", \
1401 highest_active_reg); \
1402 PUSH_FAILURE_INT (highest_active_reg); \
1404 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", \
1405 (long) pattern_place); \
1406 DEBUG_PRINT_COMPILED_PATTERN(bufp, pattern_place, pend); \
1407 PUSH_FAILURE_POINTER (pattern_place); \
1409 DEBUG_PRINT2(" Pushing string %p: `", \
1411 DEBUG_PRINT_DOUBLE_STRING(string_place, \
1414 DEBUG_PRINT1("'\n"); \
1415 PUSH_FAILURE_POINTER (string_place); \
1417 DEBUG_PRINT2(" Pushing failure id: %u\n", failure_id); \
1418 DEBUG_PUSH(failure_id); \
1421 /* This is the number of items that are pushed and popped on the stack
1422 for each register. */
1423 #define NUM_REG_ITEMS 3
1425 /* Individual items aside from the registers. */
1427 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1429 #define NUM_NONREG_ITEMS 4
1432 /* We push at most this many items on the stack. */
1433 /* We used to use (num_regs - 1), which is the number of registers
1434 this regexp will save; but that was changed to 5
1435 to avoid stack overflow for a regexp with lots of parens. */
1436 #define MAX_FAILURE_ITEMS (5 * (NUM_REG_ITEMS + NUM_NONREG_ITEMS))
1438 /* We actually push this many items. */
1439 #define NUM_FAILURE_ITEMS \
1440 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1443 /* How many items can still be added to the stack without overflowing it. */
1444 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1446 /* Pops what PUSH_FAIL_STACK pushes.
1448 We restore into the parameters, all of which should be lvalues:
1449 STR -- the saved data position.
1450 PAT -- the saved pattern position.
1451 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1452 REGSTART, REGEND -- arrays of string positions.
1453 REG_INFO -- array of information about each subexpression.
1455 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1456 `pend', `string1', `size1', `string2', and `size2'. */
1458 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1459 regstart, regend, reg_info) \
1462 const unsigned char *string_temp; \
1463 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id); \
1465 assert (!FAIL_STACK_EMPTY ()); \
1467 /* Remove failure points and point to \
1468 * how many regs pushed. */ \
1469 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1470 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1471 (unsigned long) fail_stack.avail); \
1472 DEBUG_PRINT2 (" size: %lu\n", \
1473 (unsigned long) fail_stack.size); \
1475 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1477 DEBUG_POP (&ffailure_id.integer); \
1478 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1479 * (unsigned int *) &ffailure_id); \
1481 /* If the saved string location is NULL, it \
1482 came from an on_failure_keep_string_jump opcode, \
1483 and we want to throw away the saved NULL, thus \
1484 retaining our current position in the string. */ \
1485 string_temp = (const unsigned char *) \
1486 POP_FAILURE_POINTER (); \
1487 if (string_temp != NULL) \
1488 str = string_temp; \
1490 DEBUG_PRINT2(" Popping string %p: `", str); \
1491 DEBUG_PRINT_DOUBLE_STRING(str, \
1494 DEBUG_PRINT1("'\n"); \
1496 pat = (unsigned char *) POP_FAILURE_POINTER(); \
1497 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1498 DEBUG_PRINT_COMPILED_PATTERN(bufp, pat, pend); \
1500 /* Restore register info. */ \
1501 high_reg = (unsigned) POP_FAILURE_INT (); \
1502 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1504 low_reg = (unsigned) POP_FAILURE_INT (); \
1505 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1507 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) { \
1508 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1510 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1511 DEBUG_PRINT2 (" info: %p\n", \
1512 ®_info[this_reg]); \
1514 regend[this_reg] = POP_FAILURE_POINTER (); \
1515 DEBUG_PRINT2 (" end: %p\n", \
1516 regend[this_reg]); \
1518 regstart[this_reg] = POP_FAILURE_POINTER (); \
1519 DEBUG_PRINT2 (" start: %p\n", \
1520 regstart[this_reg]); \
1523 set_regs_matched_done = 0; \
1524 DEBUG_STATEMENT (nfailure_points_popped++); \
1525 } while (0) /* POP_FAILURE_POINT */
1527 /* Structure for per-register (a.k.a. per-group) information.
1528 Other register information, such as the
1529 starting and ending positions (which are addresses), and the list of
1530 inner groups (which is a bits list) are maintained in separate
1533 We are making a (strictly speaking) nonportable assumption here: that
1534 the compiler will pack our bit fields into something that fits into
1535 the type of `word', i.e., is something that fits into one item on the
1539 fail_stack_elt_t word;
1541 /* This field is one if this group can match the empty string,
1542 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1543 #define MATCH_NULL_UNSET_VALUE 3
1544 unsigned match_null_string_p:2;
1545 unsigned is_active:1;
1546 unsigned matched_something:1;
1547 unsigned ever_matched_something:1;
1549 } register_info_type;
1551 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1552 #define IS_ACTIVE(R) ((R).bits.is_active)
1553 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1554 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1556 /* Call this when have matched a real character; it sets `matched' flags
1557 for the subexpressions which we are currently inside. Also records
1558 that those subexprs have matched. */
1559 #define SET_REGS_MATCHED() \
1562 if (!set_regs_matched_done) \
1565 set_regs_matched_done = 1; \
1566 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1568 MATCHED_SOMETHING (reg_info[r]) \
1569 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1576 /* Registers are set to a sentinel when they haven't yet matched. */
1577 static unsigned char reg_unset_dummy;
1578 #define REG_UNSET_VALUE (®_unset_dummy)
1579 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1581 /* Subroutine declarations and macros for regex_compile. */
1583 /* Fetch the next character in the uncompiled pattern---translating it
1584 if necessary. Also cast from a signed character in the constant
1585 string passed to us by the user to an unsigned char that we can use
1586 as an array index (in, e.g., `translate'). */
1587 #define PATFETCH(c) \
1590 c = TRANSLATE (c); \
1593 /* Fetch the next character in the uncompiled pattern, with no
1595 #define PATFETCH_RAW(c) \
1596 do {if (p == pend) return REG_EEND; \
1597 assert (p < pend); \
1598 c = charptr_emchar (p); \
1602 /* Go backwards one character in the pattern. */
1603 #define PATUNFETCH DEC_CHARPTR (p)
1607 #define PATFETCH_EXTENDED(emch) \
1608 do {if (p == pend) return REG_EEND; \
1609 assert (p < pend); \
1610 emch = charptr_emchar ((const Bufbyte *) p); \
1612 if (TRANSLATE_P (translate) && emch < 0x80) \
1613 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1616 #define PATFETCH_RAW_EXTENDED(emch) \
1617 do {if (p == pend) return REG_EEND; \
1618 assert (p < pend); \
1619 emch = charptr_emchar ((const Bufbyte *) p); \
1623 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1625 #define PATFETCH_EITHER(emch) \
1627 if (has_extended_chars) \
1628 PATFETCH_EXTENDED (emch); \
1633 #define PATFETCH_RAW_EITHER(emch) \
1635 if (has_extended_chars) \
1636 PATFETCH_RAW_EXTENDED (emch); \
1638 PATFETCH_RAW (emch); \
1641 #define PATUNFETCH_EITHER \
1643 if (has_extended_chars) \
1644 PATUNFETCH_EXTENDED (emch); \
1646 PATUNFETCH (emch); \
1649 #else /* not MULE */
1651 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1652 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1653 #define PATUNFETCH_EITHER PATUNFETCH
1657 /* If `translate' is non-null, return translate[D], else just D. We
1658 cast the subscript to translate because some data is declared as
1659 `char *', to avoid warnings when a string constant is passed. But
1660 when we use a character as a subscript we must make it unsigned. */
1661 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1663 /* Macros for outputting the compiled pattern into `buffer'. */
1665 /* If the buffer isn't allocated when it comes in, use this. */
1666 #define INIT_BUF_SIZE 32
1668 /* Make sure we have at least N more bytes of space in buffer. */
1669 #define GET_BUFFER_SPACE(n) \
1670 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1673 /* Make sure we have one more byte of buffer space and then add C to it. */
1674 #define BUF_PUSH(c) \
1676 GET_BUFFER_SPACE (1); \
1677 *buf_end++ = (unsigned char) (c); \
1680 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1681 #define BUF_PUSH_2(c1, c2) \
1683 GET_BUFFER_SPACE (2); \
1684 *buf_end++ = (unsigned char) (c1); \
1685 *buf_end++ = (unsigned char) (c2); \
1688 /* As with BUF_PUSH_2, except for three bytes. */
1689 #define BUF_PUSH_3(c1, c2, c3) \
1691 GET_BUFFER_SPACE (3); \
1692 *buf_end++ = (unsigned char) (c1); \
1693 *buf_end++ = (unsigned char) (c2); \
1694 *buf_end++ = (unsigned char) (c3); \
1697 /* Store a jump with opcode OP at LOC to location TO. We store a
1698 relative address offset by the three bytes the jump itself occupies. */
1699 #define STORE_JUMP(op, loc, to) \
1700 store_op1 (op, loc, (to) - (loc) - 3)
1702 /* Likewise, for a two-argument jump. */
1703 #define STORE_JUMP2(op, loc, to, arg) \
1704 store_op2 (op, loc, (to) - (loc) - 3, arg)
1706 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1708 #define INSERT_JUMP(op, loc, to) \
1709 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1711 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1713 #define INSERT_JUMP2(op, loc, to, arg) \
1714 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1716 /* This is not an arbitrary limit: the arguments which represent offsets
1717 into the pattern are two bytes long. So if 2^16 bytes turns out to
1718 be too small, many things would have to change. */
1719 #define MAX_BUF_SIZE (1L << 16)
1721 /* Extend the buffer by twice its current size via realloc and
1722 reset the pointers that pointed into the old block to point to the
1723 correct places in the new one. If extending the buffer results in it
1724 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1725 #define EXTEND_BUFFER() \
1727 re_char *old_buffer = bufp->buffer; \
1728 if (bufp->allocated == MAX_BUF_SIZE) \
1730 bufp->allocated <<= 1; \
1731 if (bufp->allocated > MAX_BUF_SIZE) \
1732 bufp->allocated = MAX_BUF_SIZE; \
1733 bufp->buffer = (unsigned char *)xrealloc( \
1734 bufp->buffer, bufp->allocated); \
1735 if (bufp->buffer == NULL) { \
1736 return REG_ESPACE; \
1738 /* If the buffer moved, move all the pointers into it. */ \
1739 if (old_buffer != bufp->buffer) { \
1740 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1741 begalt = (begalt - old_buffer) + bufp->buffer; \
1742 if (fixup_alt_jump) { \
1744 (fixup_alt_jump - old_buffer) + \
1748 laststart = (laststart - old_buffer) + \
1751 if (pending_exact) { \
1753 (pending_exact - old_buffer) + \
1759 /* Since we have one byte reserved for the register number argument to
1760 {start,stop}_memory, the maximum number of groups we can report
1761 things about is what fits in that byte. */
1762 #define MAX_REGNUM 255
1764 /* But patterns can have more than `MAX_REGNUM' registers. We just
1765 ignore the excess. */
1766 typedef unsigned regnum_t;
1768 #define INIT_REG_TRANSLATE_SIZE 5
1770 /* Macros for the compile stack. */
1772 /* Since offsets can go either forwards or backwards, this type needs to
1773 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1774 typedef int pattern_offset_t;
1777 pattern_offset_t begalt_offset;
1778 pattern_offset_t fixup_alt_jump;
1779 pattern_offset_t inner_group_offset;
1780 pattern_offset_t laststart_offset;
1782 } compile_stack_elt_t;
1785 compile_stack_elt_t *stack;
1787 unsigned avail; /* Offset of next open position. */
1788 } compile_stack_type;
1790 #define INIT_COMPILE_STACK_SIZE 32
1792 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1793 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1795 /* The next available element. */
1796 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1798 /* Set the bit for character C in a bit vector. */
1799 #define SET_LIST_BIT(c) \
1800 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1801 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1805 /* Set the "bit" for character C in a range table. */
1806 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1808 /* Set the "bit" for character c in the appropriate table. */
1809 #define SET_EITHER_BIT(c) \
1811 if (has_extended_chars) \
1812 SET_RANGETAB_BIT (c); \
1817 #else /* not MULE */
1819 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1823 /* Get the next unsigned number in the uncompiled pattern. */
1824 #define GET_UNSIGNED_NUMBER(num) \
1828 while (ISDIGIT (c)) \
1832 num = num * 10 + c - '0'; \
1840 #define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
1843 static void store_op1(re_opcode_t op, unsigned char *loc, int arg);
1844 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1845 static void insert_op1(re_opcode_t op, unsigned char *loc, int arg,
1846 unsigned char *end);
1847 static void insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1848 unsigned char *end);
1849 static re_bool at_begline_loc_p(re_char*, re_char*, reg_syntax_t);
1850 static re_bool at_endline_loc_p(re_char*, re_char*, int syntax);
1851 static re_bool group_in_compile_stack(compile_stack_type compile_stack,
1853 static reg_errcode_t compile_range(re_char ** p_ptr, re_char * pend,
1854 RE_TRANSLATE_TYPE translate,
1855 reg_syntax_t syntax, unsigned char *b);
1857 static reg_errcode_t compile_extended_range(re_char ** p_ptr,
1859 RE_TRANSLATE_TYPE translate,
1860 reg_syntax_t syntax,
1863 static re_bool group_match_null_string_p(unsigned char **p,
1865 register_info_type * reg_info);
1866 static re_bool alt_match_null_string_p(unsigned char *p, unsigned char *end,
1867 register_info_type * reg_info);
1868 static re_bool common_op_match_null_string_p(unsigned char **p,
1870 register_info_type * reg_info);
1871 static int bcmp_translate(const unsigned char *s1, const unsigned char *s2,
1872 REGISTER int len, RE_TRANSLATE_TYPE translate);
1873 static int re_match_2_internal(struct re_pattern_buffer *bufp,
1874 re_char * string1, int size1,
1875 re_char * string2, int size2, int pos,
1876 struct re_registers *regs, int stop);
1878 #ifndef MATCH_MAY_ALLOCATE
1880 /* If we cannot allocate large objects within re_match_2_internal,
1881 we make the fail stack and register vectors global.
1882 The fail stack, we grow to the maximum size when a regexp
1884 The register vectors, we adjust in size each time we
1885 compile a regexp, according to the number of registers it needs. */
1887 static fail_stack_type fail_stack;
1889 /* Size with which the following vectors are currently allocated.
1890 That is so we can make them bigger as needed,
1891 but never make them smaller. */
1892 static int regs_allocated_size;
1894 static re_char **regstart, **regend;
1895 static re_char **old_regstart, **old_regend;
1896 static re_char **best_regstart, **best_regend;
1897 static register_info_type *reg_info;
1898 static re_char **reg_dummy;
1899 static register_info_type *reg_info_dummy;
1901 /* Make the register vectors big enough for NUM_REGS registers,
1902 but don't make them smaller. */
1904 static void regex_grow_registers(int num_regs)
1906 if (num_regs > regs_allocated_size) {
1907 RETALLOC_IF(regstart, num_regs, re_char *);
1908 RETALLOC_IF(regend, num_regs, re_char *);
1909 RETALLOC_IF(old_regstart, num_regs, re_char *);
1910 RETALLOC_IF(old_regend, num_regs, re_char *);
1911 RETALLOC_IF(best_regstart, num_regs, re_char *);
1912 RETALLOC_IF(best_regend, num_regs, re_char *);
1913 RETALLOC_IF(reg_info, num_regs, register_info_type);
1914 RETALLOC_IF(reg_dummy, num_regs, re_char *);
1915 RETALLOC_IF(reg_info_dummy, num_regs, register_info_type);
1917 regs_allocated_size = num_regs;
1921 #endif /* not MATCH_MAY_ALLOCATE */
1923 /* auxiliary stuff, FSF theft */
1924 /* Character classes. */
1927 RECC_ALNUM, RECC_ALPHA, RECC_WORD,
1928 RECC_GRAPH, RECC_PRINT,
1929 RECC_LOWER, RECC_UPPER,
1930 RECC_PUNCT, RECC_CNTRL,
1931 RECC_DIGIT, RECC_XDIGIT,
1932 RECC_BLANK, RECC_SPACE,
1933 RECC_MULTIBYTE, RECC_NONASCII,
1934 RECC_ASCII, RECC_UNIBYTE
1937 /* Map a string to the char class it names (if any). */
1938 static inline re_wctype_t
1939 re_wctype(re_char *str)
1941 const char *string = (const char*)str;
1943 if (STREQ (string, "alnum")) {
1945 } else if (STREQ (string, "alpha")) {
1947 } else if (STREQ (string, "digit")) {
1949 } else if (STREQ (string, "graph")) {
1951 } else if (STREQ (string, "space")) {
1953 } else if (STREQ (string, "upper")) {
1955 } else if (STREQ (string, "lower")) {
1957 } else if (STREQ (string, "word")) {
1959 } else if (STREQ (string, "print")) {
1961 } else if (STREQ (string, "punct")) {
1963 } else if (STREQ (string, "xdigit")) {
1965 } else if (STREQ (string, "cntrl")) {
1967 } else if (STREQ (string, "blank")) {
1969 } else if (STREQ (string, "ascii")) {
1971 } else if (STREQ (string, "nonascii")) {
1972 return RECC_NONASCII;
1973 } else if (STREQ (string, "unibyte")) {
1974 return RECC_UNIBYTE;
1975 } else if (STREQ (string, "multibyte")) {
1976 return RECC_MULTIBYTE;
1982 /* True if CH is in the char class CC. */
1984 re_iswctype(int ch, re_wctype_t cc)
1988 return ISALNUM (ch);
1990 return ISALPHA (ch);
1992 return ISBLANK (ch);
1994 return ISCNTRL (ch);
1996 return ISDIGIT (ch);
1998 return ISGRAPH (ch);
2000 return ISLOWER (ch);
2002 return ISPRINT (ch);
2004 return ISPUNCT (ch);
2006 return ISSPACE (ch);
2008 return ISUPPER (ch);
2010 return ISXDIGIT (ch);
2012 return IS_REAL_ASCII (ch);
2014 return !IS_REAL_ASCII (ch);
2018 return ISUNIBYTE((unsigned int)ch);
2019 case RECC_MULTIBYTE:
2020 return !ISUNIBYTE((unsigned int)ch);
2030 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2031 Returns one of error codes defined in `regex.h', or zero for success.
2033 Assumes the `allocated' (and perhaps `buffer') and `translate'
2034 fields are set in BUFP on entry.
2036 If it succeeds, results are put in BUFP (if it returns an error, the
2037 contents of BUFP are undefined):
2038 `buffer' is the compiled pattern;
2039 `syntax' is set to SYNTAX;
2040 `used' is set to the length of the compiled pattern;
2041 `fastmap_accurate' is zero;
2042 `re_ngroups' is the number of groups/subexpressions (including shy
2044 `re_nsub' is the number of non-shy groups in PATTERN;
2045 `not_bol' and `not_eol' are zero;
2047 The `fastmap' and `newline_anchor' fields are neither
2048 examined nor set. */
2050 static reg_errcode_t
2051 regex_compile(re_char * pattern, int size, reg_syntax_t syntax,
2052 struct re_pattern_buffer *bufp)
2054 /* We fetch characters from PATTERN here. We declare these as int
2055 (or possibly long) so that chars above 127 can be used as
2056 array indices. The macros that fetch a character from the pattern
2057 make sure to coerce to unsigned char before assigning, so we won't
2058 get bitten by negative numbers here. */
2059 /* XEmacs change: used to be unsigned char. */
2060 REGISTER EMACS_INT c, c1;
2062 /* A random temporary spot in PATTERN. */
2065 /* Points to the end of the buffer, where we should append. */
2066 REGISTER unsigned char *buf_end;
2068 /* Keeps track of unclosed groups. */
2069 compile_stack_type compile_stack;
2071 /* Points to the current (ending) position in the pattern. */
2072 re_char *p = pattern;
2073 re_char *pend = pattern + size;
2075 /* How to translate the characters in the pattern. */
2076 RE_TRANSLATE_TYPE translate = bufp->translate;
2078 /* Address of the count-byte of the most recently inserted `exactn'
2079 command. This makes it possible to tell if a new exact-match
2080 character can be added to that command or if the character requires
2081 a new `exactn' command. */
2082 unsigned char *pending_exact = 0;
2084 /* Address of start of the most recently finished expression.
2085 This tells, e.g., postfix * where to find the start of its
2086 operand. Reset at the beginning of groups and alternatives. */
2087 unsigned char *laststart = 0;
2089 /* Address of beginning of regexp, or inside of last group. */
2090 unsigned char *begalt;
2092 /* Place in the uncompiled pattern (i.e., the {) to
2093 which to go back if the interval is invalid. */
2094 re_char *beg_interval;
2096 /* Address of the place where a forward jump should go to the end of
2097 the containing expression. Each alternative of an `or' -- except the
2098 last -- ends with a forward jump of this sort. */
2099 unsigned char *fixup_alt_jump = 0;
2101 /* Counts open-groups as they are encountered. Remembered for the
2102 matching close-group on the compile stack, so the same register
2103 number is put in the stop_memory as the start_memory. */
2104 regnum_t regnum = 0;
2106 /* we need to close over compile_stack */
2107 # define free_stack(args...) xfree(compile_stack.stack)
2109 #ifdef REGEX_DEBUG_FLAG
2110 DEBUG_PRINT1("\nCompiling pattern: ");
2114 for (debug_count = 0; debug_count < size; debug_count++)
2115 putchar(pattern[debug_count]);
2120 /* Initialize the compile stack. */
2121 compile_stack.stack =
2122 TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2123 if (compile_stack.stack == NULL) {
2126 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2127 compile_stack.avail = 0;
2129 /* Initialize the pattern buffer. */
2130 bufp->syntax = syntax;
2131 bufp->fastmap_accurate = 0;
2132 bufp->not_bol = bufp->not_eol = 0;
2134 /* Set `used' to zero, so that if we return an error, the pattern
2135 printer (for debugging) will think there's no pattern. We reset it
2139 /* Always count groups, whether or not bufp->no_sub is set. */
2141 bufp->re_ngroups = 0;
2143 /* Allocate index translation array if needed. */
2144 if (bufp->external_to_internal_register == 0) {
2145 bufp->external_to_internal_register_size =
2146 INIT_REG_TRANSLATE_SIZE;
2147 RETALLOC(bufp->external_to_internal_register,
2148 bufp->external_to_internal_register_size, int);
2150 /* Initialize translations to impossible value to aid debugging. */
2154 bufp->external_to_internal_register[0] = 0;
2155 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2156 bufp->external_to_internal_register[i] =
2160 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2161 /* Initialize the syntax table. */
2165 if (bufp->allocated == 0) {
2167 /* If zero allocated, but buffer is non-null, try to
2168 realloc enough space. This loses if buffer's address
2169 is bogus, but that is the user's responsibility. */
2170 RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
2172 /* Caller did not allocate a buffer. Do it for them. */
2173 bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
2175 if (!bufp->buffer) {
2180 bufp->allocated = INIT_BUF_SIZE;
2183 begalt = buf_end = bufp->buffer;
2185 /* Loop through the uncompiled pattern until we're at the end. */
2191 if ( /* If at start of pattern, it's an operator. */
2193 /* If context independent, it's an operator. */
2194 || syntax & RE_CONTEXT_INDEP_ANCHORS
2195 /* Otherwise, depends on what's come before. */
2196 || at_begline_loc_p(pattern, p, syntax))
2204 if ( /* If at end of pattern, it's an
2207 /* If context independent, it's an
2209 || syntax & RE_CONTEXT_INDEP_ANCHORS
2210 /* Otherwise, depends on what's
2212 || at_endline_loc_p(p, pend, syntax))
2221 if ((syntax & RE_BK_PLUS_QM) ||
2222 (syntax & RE_LIMITED_OPS)) {
2228 re_bool zero_times_ok;
2229 re_bool many_times_ok;
2232 /* If there is no previous pattern... */
2234 if (syntax & RE_CONTEXT_INVALID_OPS) {
2237 } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) {
2242 /* true means zero/many matches are allowed. */
2243 zero_times_ok = c != '+';
2244 many_times_ok = c != '?';
2246 /* true means match shortest string possible. */
2249 /* If there is a sequence of repetition chars, collapse it
2250 down to just one (the right one). We can't combine
2251 interval operators with these because of, e.g., `a{2}*',
2252 which should only match an even number of `a's. */
2257 || (!(syntax & RE_BK_PLUS_QM)
2258 && (c == '+' || c == '?'))) ;
2260 else if (syntax & RE_BK_PLUS_QM
2268 if (!(c1 == '+' || c1 == '?')) {
2280 /* If we get here, we found another repeat
2282 if (!(syntax & RE_NO_MINIMAL_MATCHING)) {
2283 /* "*?" and "+?" and "??" are okay (and
2284 mean match minimally), but other
2285 sequences (such as "*??" and "+++")
2286 are rejected (reserved for future
2288 if (minimal || c != '?') {
2294 zero_times_ok |= c != '+';
2295 many_times_ok |= c != '?';
2299 /* Star, etc. applied to an empty pattern is equivalent
2300 to an empty pattern. */
2305 /* Now we know whether zero matches is allowed
2306 and whether two or more matches is allowed
2307 and whether we want minimal or maximal matching. */
2309 if (!many_times_ok) {
2311 0: /on_failure_jump to 6
2316 GET_BUFFER_SPACE(6);
2317 INSERT_JUMP(jump, laststart,
2320 INSERT_JUMP(on_failure_jump,
2324 } else if (zero_times_ok) {
2328 6: /on_failure_jump to 3
2331 GET_BUFFER_SPACE(6);
2332 INSERT_JUMP(jump, laststart,
2335 STORE_JUMP(on_failure_jump,
2342 3: /on_failure_jump to 0
2345 GET_BUFFER_SPACE(3);
2346 STORE_JUMP(on_failure_jump,
2347 buf_end, laststart);
2351 /* Are we optimizing this jump? */
2352 re_bool keep_string_p = false;
2354 if (many_times_ok) {
2355 /* More than one repetition is allowed,
2356 so put in at the end a backward
2357 relative jump from `buf_end' to
2358 before the next jump we're going to
2359 put in below (which jumps from
2360 laststart to after this jump).
2362 But if we are at the `*' in the exact
2363 sequence `.*\n', insert an
2364 unconditional jump backwards to the
2365 ., instead of the beginning of the
2366 loop. This way we only push a
2367 failure point once, instead of every
2368 time through the loop. */
2369 assert(p - 1 > pattern);
2371 /* Allocate the space for the jump. */
2372 GET_BUFFER_SPACE(3);
2374 /* We know we are not at the first
2375 character of the pattern, because
2376 laststart was nonzero. And we've
2377 already incremented `p', by the way,
2378 to be the character after the `*'.
2379 Do we have to do something analogous
2380 here for null bytes, because of
2382 if (*(p - 2) == '.' &&
2386 !(syntax & RE_DOT_NEWLINE)) {
2391 keep_string_p = true;
2393 /* Anything else. */
2399 /* We've added more stuff to the
2404 /* On failure, jump from laststart to buf_end +
2405 3, which will be the end of the buffer after
2406 this jump is inserted. */
2407 GET_BUFFER_SPACE(3);
2408 INSERT_JUMP(keep_string_p ?
2409 on_failure_keep_string_jump
2411 laststart, buf_end + 3);
2414 if (!zero_times_ok) {
2415 /* At least one repetition is required,
2416 so insert a `dummy_failure_jump'
2417 before the initial `on_failure_jump'
2418 instruction of the loop. This effects
2419 a skip over that instruction the
2420 first time we hit that loop. */
2421 GET_BUFFER_SPACE(3);
2422 INSERT_JUMP(dummy_failure_jump,
2433 laststart = buf_end;
2438 /* XEmacs change: this whole section */
2439 re_bool had_char_class = false;
2441 re_bool has_extended_chars = false;
2442 REGISTER Lisp_Object rtab = Qnil;
2450 /* Ensure that we have enough space to push a charset:
2451 the opcode, the length count, and the bitset; 34
2453 GET_BUFFER_SPACE(34);
2455 laststart = buf_end;
2457 /* We test `*p == '^' twice, instead of using an if
2458 statement, so we only need one BUF_PUSH. */
2459 BUF_PUSH(*p == '^' ? charset_not : charset);
2463 /* Remember the first position in the bracket
2467 /* Push the number of bytes in the bitmap. */
2468 BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2470 /* Clear the whole map. */
2471 memset(buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2473 /* charset_not matches newline according to a syntax
2475 if ((re_opcode_t) buf_end[-2] == charset_not
2476 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2480 start_over_with_extended:
2481 if (has_extended_chars) {
2482 /* There are extended chars here, which means we
2483 need to start over and shift to unified
2484 range-table format. */
2485 if (buf_end[-2] == charset)
2486 buf_end[-2] = charset_mule;
2488 buf_end[-2] = charset_mule_not;
2491 /* go back to the beginning of the charset,
2492 after a possible ^. */
2493 rtab = Vthe_lisp_rangetab;
2494 Fclear_range_table(rtab);
2496 /* charset_not matches newline according to a
2498 if ((re_opcode_t) buf_end[-1] ==
2501 RE_HAT_LISTS_NOT_NEWLINE))
2502 SET_EITHER_BIT('\n');
2506 /* Read in characters and ranges, setting map bits. */
2515 if (c >= 0x80 && !has_extended_chars) {
2516 has_extended_chars = 1;
2517 /* Frumble-bumble, we've found some
2518 extended chars. Need to start over,
2519 process everything using the general
2520 extended-char mechanism, and need to
2521 use charset_mule and charset_mule_not
2522 instead of charset and
2524 goto start_over_with_extended;
2527 /* \ might escape characters inside [...] and
2529 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) &&
2539 && !has_extended_chars) {
2540 has_extended_chars = 1;
2541 goto start_over_with_extended;
2548 /* Could be the end of the bracket
2549 expression. If it's not (i.e., when
2550 the bracket expression is `[]' so
2551 far), the ']' character bit gets set
2553 if (c == ']' && p != p1 + 1)
2556 /* Look ahead to see if it's a range
2557 when the last thing was a character
2559 if (had_char_class && c == '-'
2565 /* Look ahead to see if it's a range
2566 when the last thing was a character:
2567 if this is a hyphen not at the
2568 beginning or the end of a list, then
2569 it's the range operator. */
2571 && !(p - 2 >= pattern
2573 && !(p - 3 >= pattern
2580 if (*(const unsigned char *)p >= 0x80
2581 && !has_extended_chars) {
2582 has_extended_chars = 1;
2583 goto start_over_with_extended;
2585 if (has_extended_chars)
2586 ret = compile_extended_range
2587 (&p, pend, translate,
2592 (&p, pend, translate,
2594 if (ret != REG_NOERROR) {
2600 else if (p[0] == '-' && p[1] != ']') {
2601 /* This handles ranges made up of
2605 /* Move past the `-'. */
2609 if (*(const unsigned char*)p >= 0x80
2610 && !has_extended_chars) {
2611 has_extended_chars = 1;
2612 goto start_over_with_extended;
2614 if (has_extended_chars)
2615 ret = compile_extended_range(
2616 &p, pend, translate,
2620 ret = compile_range(
2621 &p, pend, translate,
2623 if (ret != REG_NOERROR) {
2629 /* See if we're at the beginning of a possible
2632 else if (syntax & RE_CHAR_CLASSES &&
2633 c == '[' && *p == ':') {
2634 /* Leave room for the null. */
2635 char str[CHAR_CLASS_MAX_LENGTH + 1];
2640 /* If pattern is `[[:'. */
2646 /* #### This code is unused.
2647 Correctness is not checked
2648 after TRT table change. */
2650 if (c == ':' || c == ']'
2653 CHAR_CLASS_MAX_LENGTH)
2655 str[c1++] = (char)c;
2659 /* If isn't a word bracketed by
2660 `[:' and `:]': undo the
2661 ending character, the
2662 letters, and leave the
2663 leading `:' and `[' (but set
2665 if (c == ':' && *p == ']') {
2671 if (wct == RECC_ERROR) {
2676 /* Throw away the ] at
2696 re_iswctype(ch, wct)) {
2700 if (re_iswctype(ch, wct)) {
2705 had_char_class = true;
2711 SET_EITHER_BIT('[');
2712 SET_EITHER_BIT(':');
2713 had_char_class = false;
2716 had_char_class = false;
2722 if (has_extended_chars) {
2723 /* We have a range table, not a bit vector. */
2725 unified_range_table_bytes_needed
2727 GET_BUFFER_SPACE(bytes_needed);
2728 unified_range_table_copy_data(rtab,
2731 unified_range_table_bytes_used
2736 /* Discard any (non)matching list bytes that are
2737 all 0 at the end of the map. Decrease the
2738 map-length byte too. */
2739 while ((int)buf_end[-1] > 0
2740 && buf_end[buf_end[-1] - 1] == 0)
2742 buf_end += buf_end[-1];
2747 if (syntax & RE_NO_BK_PARENS)
2753 if (syntax & RE_NO_BK_PARENS)
2759 if (syntax & RE_NEWLINE_ALT)
2765 if (syntax & RE_NO_BK_VBAR)
2771 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2772 goto handle_interval;
2782 /* Do not translate the character after the \, so that we can
2783 distinguish, e.g., \B from \b, even if we normally would
2784 translate, e.g., B to b. */
2789 if (syntax & RE_NO_BK_PARENS)
2790 goto normal_backslash;
2797 if (!(syntax & RE_NO_SHY_GROUPS)
2798 && p != pend && *p == '?') {
2819 /* Record the translation from capturing group index to
2820 register number, reallocating table as needed. */
2823 external_to_internal_register_size
2828 external_to_internal_register_size;
2830 external_to_internal_register_size
2833 external_to_internal_register,
2835 external_to_internal_register_size,
2841 external_to_internal_register_size;
2844 external_to_internal_register
2851 external_to_internal_register
2856 if (COMPILE_STACK_FULL) {
2857 RETALLOC(compile_stack.stack,
2860 compile_stack_elt_t);
2861 if (compile_stack.stack == NULL)
2864 compile_stack.size <<= 1;
2867 /* These are the values to restore when
2868 we hit end of this group. They are
2869 all relative offsets, so that if the
2870 whole pattern moves because of
2871 realloc, they will still be
2873 COMPILE_STACK_TOP.begalt_offset =
2874 begalt - bufp->buffer;
2875 COMPILE_STACK_TOP.fixup_alt_jump =
2880 COMPILE_STACK_TOP.laststart_offset =
2881 buf_end - bufp->buffer;
2882 COMPILE_STACK_TOP.regnum = r;
2884 /* We will eventually replace the 0 with
2885 the number of groups inner to this
2886 one. But do not push a start_memory
2887 for groups beyond the last one we can
2888 represent in the compiled pattern.
2889 #### bad bad bad. this will fail in
2890 lots of ways, if we ever have to
2891 backtrack for these groups.
2893 if (r <= MAX_REGNUM) {
2895 inner_group_offset =
2896 buf_end - bufp->buffer +
2898 BUF_PUSH_3(start_memory, r, 0);
2901 compile_stack.avail++;
2906 /* If we've reached MAX_REGNUM groups,
2907 then this open won't actually
2908 generate any code, so we'll have to
2909 clear pending_exact explicitly. */
2915 if (syntax & RE_NO_BK_PARENS)
2916 goto normal_backslash;
2918 if (COMPILE_STACK_EMPTY) {
2920 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2921 goto normal_backslash;
2929 if (fixup_alt_jump) {
2930 /* Push a dummy failure point at the end
2931 of the alternative for a possible
2932 future `pop_failure_jump' to pop.
2933 See comments at `push_dummy_failure'
2935 BUF_PUSH(push_dummy_failure);
2937 /* We allocated space for this jump when
2938 we assigned to `fixup_alt_jump', in
2939 the `handle_alt' case below. */
2940 STORE_JUMP(jump_past_alt,
2941 fixup_alt_jump, buf_end - 1);
2944 /* See similar code for backslashed left paren
2946 if (COMPILE_STACK_EMPTY) {
2948 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2956 /* Since we just checked for an empty stack
2957 above, this ``can't happen''. */
2958 assert(compile_stack.avail != 0);
2960 /* We don't just want to restore into
2961 `regnum', because later groups should
2962 continue to be numbered higher, as in
2963 `(ab)c(de)' -- the second group is
2965 regnum_t this_group_regnum;
2967 compile_stack.avail--;
2970 COMPILE_STACK_TOP.begalt_offset;
2983 COMPILE_STACK_TOP.regnum;
2984 /* If we've reached MAX_REGNUM groups,
2985 then this open won't actually
2986 generate any code, so we'll have to
2987 clear pending_exact explicitly. */
2990 /* We're at the end of the group, so now
2991 we know how many groups were inside
2993 if (this_group_regnum <= MAX_REGNUM) {
2994 unsigned char *inner_group_loc
3003 BUF_PUSH_3(stop_memory,
3011 case '|': /* `\|'. */
3012 if (syntax & RE_LIMITED_OPS
3013 || syntax & RE_NO_BK_VBAR) {
3014 goto normal_backslash;
3017 if (syntax & RE_LIMITED_OPS) {
3021 /* Insert before the previous alternative a jump
3022 which jumps to this alternative if the former
3024 GET_BUFFER_SPACE(3);
3025 INSERT_JUMP(on_failure_jump, begalt,
3030 /* The alternative before this one has a jump after it
3031 which gets executed if it gets matched. Adjust that
3032 jump so it will jump to this alternative's analogous
3033 jump (put in below, which in turn will jump to the next
3034 (if any) alternative's such jump, etc.). The last such
3035 jump jumps to the correct final destination. A picture:
3041 If we are at `b', then fixup_alt_jump right now points to a
3042 three-byte space after `a'. We'll put in the jump, set
3043 fixup_alt_jump to right after `b', and leave behind three
3044 bytes which we'll fill in when we get to after `c'. */
3047 STORE_JUMP(jump_past_alt,
3048 fixup_alt_jump, buf_end);
3050 /* Mark and leave space for a jump after this alternative,
3051 to be filled in later either by next alternative or
3052 when know we're at the end of a series of alternatives. */
3053 fixup_alt_jump = buf_end;
3054 GET_BUFFER_SPACE(3);
3062 /* If \{ is a literal. */
3063 if (!(syntax & RE_INTERVALS)
3064 /* If we're at `\{' and it's not the open-interval
3066 || ((syntax & RE_INTERVALS)
3067 && (syntax & RE_NO_BK_BRACES))
3068 || (p - 2 == pattern && p == pend))
3069 goto normal_backslash;
3073 /* If got here, then the syntax allows
3076 /* At least (most) this many matches
3078 int lower_bound = -1, upper_bound = -1;
3080 beg_interval = p - 1;
3083 if (syntax & RE_NO_BK_BRACES) {
3084 goto unfetch_interval;
3091 GET_UNSIGNED_NUMBER(lower_bound);
3094 GET_UNSIGNED_NUMBER(
3096 if (upper_bound < 0) {
3101 /* Interval such as `{1}' =>
3102 match exactly once. */
3103 upper_bound = lower_bound;
3107 || upper_bound > RE_DUP_MAX
3108 || lower_bound > upper_bound) {
3109 if (syntax & RE_NO_BK_BRACES) {
3110 goto unfetch_interval;
3117 if (!(syntax & RE_NO_BK_BRACES)) {
3126 if (syntax & RE_NO_BK_BRACES) {
3127 goto unfetch_interval;
3134 /* We just parsed a valid interval. */
3136 /* If it's invalid to have no preceding
3140 RE_CONTEXT_INVALID_OPS) {
3145 RE_CONTEXT_INDEP_OPS) {
3146 laststart = buf_end;
3148 goto unfetch_interval;
3152 /* If the upper bound is zero, don't
3153 want to succeed at all; jump from
3154 `laststart' to `b + 3', which will be
3155 the end of the buffer after we insert
3157 if (upper_bound == 0) {
3158 GET_BUFFER_SPACE(3);
3159 INSERT_JUMP(jump, laststart,
3164 /* Otherwise, we have a nontrivial interval. When
3165 we're all done, the pattern will look like:
3166 set_number_at <jump count> <upper bound>
3167 set_number_at <succeed_n count> <lower bound>
3168 succeed_n <after jump addr> <succeed_n count>
3170 jump_n <succeed_n addr> <jump count>
3171 (The upper bound and `jump_n' are omitted if
3172 `upper_bound' is 1, though.) */
3173 else { /* If the upper bound is > 1, we need to insert
3174 more at the end of the loop. */
3175 Memory_count nbytes =