1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
10 This file is part of SXEmacs
12 SXEmacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 SXEmacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
86 /* The `emacs' switch turns on certain matching commands
87 that make sense only in Emacs. */
96 Lisp_Object Vthe_lisp_rangetab;
98 void complex_vars_of_regex(void)
100 Vthe_lisp_rangetab = Fmake_range_table();
101 staticpro(&Vthe_lisp_rangetab);
106 void complex_vars_of_regex(void)
112 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
113 #define TRANSLATE_P(tr) (!NILP (tr) && CHAR_TABLEP(tr))
115 /* just massage that xfree thing */
117 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
125 /* If we are not linking with Emacs proper,
126 we can't use the relocating allocator
127 even if config.h says that we can. */
130 #if defined (STDC_HEADERS) || defined (_LIBC)
132 # include <stdbool.h>
138 /* Types normally included via lisp.h */
139 #include <stddef.h> /* for ptrdiff_t */
142 #ifndef DECLARE_NOTHING
143 #define DECLARE_NOTHING struct nosuchstruct
149 #define charptr_emchar(str) ((Emchar) (str)[0])
151 #define INC_CHARPTR(p) ((p)++)
152 #define DEC_CHARPTR(p) ((p)--)
156 /* Define the syntax stuff for \<, \>, etc. */
158 /* This must be nonzero for the wordchar and notwordchar pattern
159 commands in re_match_2. */
166 extern char *re_syntax_table;
168 #else /* not SYNTAX_TABLE */
170 /* How many characters in the character set. */
171 #define CHAR_SET_SIZE 256
173 static char re_syntax_table[CHAR_SET_SIZE];
175 static void init_syntax_once(void)
180 const char *word_syntax_chars =
181 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
183 memset(re_syntax_table, 0, sizeof(re_syntax_table));
185 while (*word_syntax_chars)
186 re_syntax_table[(unsigned int)(*word_syntax_chars++)] =
193 #endif /* SYNTAX_TABLE */
195 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
196 #undef SYNTAX_FROM_CACHE
197 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
199 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
200 #define TRANSLATE_P(tr) tr
202 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
203 # if defined HAVE_GC_GC_H
205 # elif defined HAVE_GC_H
208 # define xmalloc GC_MALLOC
209 # define xmalloc_atomic GC_MALLOC_ATOMIC
210 # define xrealloc GC_REALLOC
211 # define xfree GC_FREE
212 # define xstrdup GC_STRDUP
213 #else /* !HAVE_BDWGC */
214 # define xmalloc malloc
215 # define xmalloc_atomic malloc
216 # define xrealloc realloc
218 # define xstrdup strdup
219 #endif /* HAVE_BDWGC */
224 /* Under XEmacs, this is needed because we don't define it elsewhere. */
225 #ifdef SWITCH_ENUM_BUG
226 #define SWITCH_ENUM_CAST(x) ((int)(x))
228 #define SWITCH_ENUM_CAST(x) (x)
231 /* Get the interface, including the syntax bits. */
234 /* isalpha etc. are used for the character classes. */
237 /* Jim Meyering writes:
239 "... Some ctype macros are valid only for character codes that
240 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
241 using /bin/cc or gcc but without giving an ansi option). So, all
242 ctype uses should be through macros like ISPRINT... If
243 STDC_HEADERS is defined, then autoconf has verified that the ctype
244 macros don't need to be guarded with references to isascii. ...
245 Defining isascii to 1 should let any compiler worth its salt
246 eliminate the && through constant folding." */
248 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
249 #define ISASCII_1(c) 1
251 #define ISASCII_1(c) isascii(c)
255 /* The IS*() macros can be passed any character, including an extended
256 one. We need to make sure there are no crashes, which would occur
257 otherwise due to out-of-bounds array references. */
258 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
260 #define ISASCII(c) ISASCII_1 (c)
264 #define ISBLANK(c) (ISASCII (c) && isblank (c))
266 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
269 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
271 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
274 /* stolen from FSF/src/charset.h */
275 #define SINGLE_BYTE_CHAR_P(c) (((unsigned int)(c) & 0xFF) == (c))
276 /* 1 if C is an ASCII character. */
277 #define IS_REAL_ASCII(c) ((c) < 128)
278 /* 1 if C is a unibyte character. */
279 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P(c))
281 #define ISPRINT(c) (ISASCII (c) && isprint (c))
282 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
283 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
284 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
285 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
286 #define ISLOWER(c) (ISASCII (c) && islower (c))
287 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
288 #define ISSPACE(c) (ISASCII (c) && isspace (c))
289 #define ISUPPER(c) (ISASCII (c) && isupper (c))
290 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
295 __attribute__((always_inline));
299 Lisp_Object ct = regex_emacs_buffer->mirror_syntax_table;
301 return (SYNTAX(XCHAR_TABLE(ct), c) == Sword);
304 # define ISWORD(c) (ISALPHA(c))
308 #define NULL (void *)0
311 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
312 since ours (we hope) works properly with all combinations of
313 machines, compilers, `char' and `unsigned char' argument types.
314 (Per Bothner suggested the basic approach.) */
315 #undef SIGN_EXTEND_CHAR
317 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
318 #else /* not __STDC__ */
319 /* As in Harbison and Steele. */
320 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
323 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
324 use `alloca' instead of `malloc'. This is because using malloc in
325 re_search* or re_match* could cause memory leaks when C-g is used in
326 Emacs; also, malloc is slower and causes storage fragmentation. On
327 the other hand, malloc is more portable, and easier to debug.
329 Because we sometimes use alloca, some routines have to be macros,
330 not functions -- `alloca'-allocated space disappears at the end of the
331 function it is called in. */
335 #define REGEX_ALLOCATE xmalloc
336 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc(source, nsize)
337 #define REGEX_FREE xfree
339 #else /* !REGEX_MALLOC */
341 /* Emacs already defines alloca, sometimes. */
344 /* Make alloca work the best possible way. */
346 #define alloca __builtin_alloca
347 #else /* not __GNUC__ */
350 #else /* not __GNUC__ or HAVE_ALLOCA_H */
351 #ifndef _AIX /* Already did AIX, up at the top. */
353 #endif /* not _AIX */
354 #endif /* HAVE_ALLOCA_H */
355 #endif /* __GNUC__ */
357 #endif /* not alloca */
359 #define REGEX_ALLOCATE alloca
361 /* Assumes a `char *destination' variable. */
362 #define REGEX_REALLOCATE(source, osize, nsize) \
363 (destination = (char *) alloca (nsize), \
364 memmove (destination, source, osize), \
367 /* No need to do anything to free, after alloca. */
368 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
370 #endif /* REGEX_MALLOC */
372 /* Define how to allocate the failure stack. */
376 #define REGEX_ALLOCATE_STACK(size) \
377 r_alloc ((char **) &failure_stack_ptr, (size))
378 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
379 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
380 #define REGEX_FREE_STACK(ptr) \
381 r_alloc_free ((void **) &failure_stack_ptr)
383 #else /* !REL_ALLOC */
387 #define REGEX_ALLOCATE_STACK xmalloc
388 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc(source, nsize)
389 #define REGEX_FREE_STACK xfree
391 #else /* !REGEX_MALLOC */
393 #define REGEX_ALLOCATE_STACK alloca
395 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
396 REGEX_REALLOCATE (source, osize, nsize)
397 /* No need to explicitly free anything. */
398 #define REGEX_FREE_STACK(arg)
400 #endif /* REGEX_MALLOC */
401 #endif /* REL_ALLOC */
403 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
404 `string1' or just past its end. This works if PTR is NULL, which is
406 #define FIRST_STRING_P(ptr) \
407 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
409 /* (Re)Allocate N items of type T using malloc, or fail. */
410 #define TALLOC(n, t) \
411 ((t *)xmalloc((n) * sizeof (t)))
412 #define TALLOC_ATOMIC(n, t) \
413 ((t *)xmalloc_atomic((n) * sizeof (t)))
414 #define RETALLOC(addr, n, t) \
415 ((addr) = (t *)xrealloc(addr, (n) * sizeof (t)))
416 #define RETALLOC_IF(addr, n, t) \
418 RETALLOC((addr), (n), t); \
420 (addr) = TALLOC ((n), t)
421 #define REGEX_TALLOC(n, t) \
422 ((t *)REGEX_ALLOCATE((n) * sizeof (t)))
424 #define BYTEWIDTH 8 /* In bits. */
426 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
430 #define MAX(a, b) ((a) > (b) ? (a) : (b))
431 #define MIN(a, b) ((a) < (b) ? (a) : (b))
433 /* Type of source-pattern and string chars. */
434 typedef const unsigned char re_char;
436 typedef char re_bool;
440 /* These are the command codes that appear in compiled regular
441 expressions. Some opcodes are followed by argument bytes. A
442 command code can specify any interpretation whatsoever for its
443 arguments. Zero bytes may appear in the compiled regular expression. */
448 /* Succeed right away--no more backtracking. */
451 /* Followed by one byte giving n, then by n literal bytes. */
454 /* Matches any (more or less) character. */
457 /* Matches any one char belonging to specified set. First
458 following byte is number of bitmap bytes. Then come bytes
459 for a bitmap saying which chars are in. Bits in each byte
460 are ordered low-bit-first. A character is in the set if its
461 bit is 1. A character too large to have a bit in the map is
462 automatically not in the set. */
465 /* Same parameters as charset, but match any character that is
466 not one of those specified. */
469 /* Start remembering the text that is matched, for storing in a
470 register. Followed by one byte with the register number, in
471 the range 1 to the pattern buffer's re_ngroups
472 field. Then followed by one byte with the number of groups
473 inner to this one. (This last has to be part of the
474 start_memory only because we need it in the on_failure_jump
478 /* Stop remembering the text that is matched and store it in a
479 memory register. Followed by one byte with the register
480 number, in the range 1 to `re_ngroups' in the
481 pattern buffer, and one byte with the number of inner groups,
482 just like `start_memory'. (We need the number of inner
483 groups here because we don't have any easy way of finding the
484 corresponding start_memory when we're at a stop_memory.) */
487 /* Match a duplicate of something remembered. Followed by one
488 byte containing the register number. */
491 /* Fail unless at beginning of line. */
494 /* Fail unless at end of line. */
497 /* Succeeds if at beginning of buffer (if emacs) or at beginning
498 of string to be matched (if not). */
501 /* Analogously, for end of buffer/string. */
504 /* Followed by two byte relative address to which to jump. */
507 /* Same as jump, but marks the end of an alternative. */
510 /* Followed by two-byte relative address of place to resume at
511 in case of failure. */
514 /* Like on_failure_jump, but pushes a placeholder instead of the
515 current string position when executed. */
516 on_failure_keep_string_jump,
518 /* Throw away latest failure point and then jump to following
519 two-byte relative address. */
522 /* Change to pop_failure_jump if know won't have to backtrack to
523 match; otherwise change to jump. This is used to jump
524 back to the beginning of a repeat. If what follows this jump
525 clearly won't match what the repeat does, such that we can be
526 sure that there is no use backtracking out of repetitions
527 already matched, then we change it to a pop_failure_jump.
528 Followed by two-byte address. */
531 /* Jump to following two-byte address, and push a dummy failure
532 point. This failure point will be thrown away if an attempt
533 is made to use it for a failure. A `+' construct makes this
534 before the first repeat. Also used as an intermediary kind
535 of jump when compiling an alternative. */
538 /* Push a dummy failure point and continue. Used at the end of
542 /* Followed by two-byte relative address and two-byte number n.
543 After matching N times, jump to the address upon failure. */
546 /* Followed by two-byte relative address, and two-byte number n.
547 Jump to the address N times, then fail. */
550 /* Set the following two-byte relative address to the
551 subsequent two-byte number. The address *includes* the two
555 wordchar, /* Matches any word-constituent character. */
556 notwordchar, /* Matches any char that is not a word-constituent. */
558 wordbeg, /* Succeeds if at word beginning. */
559 wordend, /* Succeeds if at word end. */
561 wordbound, /* Succeeds if at a word boundary. */
562 notwordbound /* Succeeds if not at a word boundary. */
564 , before_dot, /* Succeeds if before point. */
565 at_dot, /* Succeeds if at point. */
566 after_dot, /* Succeeds if after point. */
568 /* Matches any character whose syntax is specified. Followed by
569 a byte which contains a syntax code, e.g., Sword. */
572 /* Matches any character whose syntax is not that specified. */
576 /* need extra stuff to be able to properly work with XEmacs/Mule
577 characters (which may take up more than one byte) */
578 , charset_mule, /* Matches any character belonging to specified set.
579 The set is stored in "unified range-table
580 format"; see rangetab.c. Unlike the `charset'
581 opcode, this can handle arbitrary characters. */
583 charset_mule_not /* Same parameters as charset_mule, but match any
584 character that is not one of those specified. */
585 /* 97/2/17 jhod: The following two were merged back in from the Mule
586 2.3 code to enable some language specific processing */
587 , categoryspec, /* Matches entries in the character category tables */
588 notcategoryspec /* The opposite of the above */
592 /* Common operations on the compiled pattern. */
594 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
596 #define STORE_NUMBER(destination, number) \
598 (destination)[0] = (number) & 0377; \
599 (destination)[1] = (number) >> 8; \
602 /* Same as STORE_NUMBER, except increment DESTINATION to
603 the byte after where the number is stored. Therefore, DESTINATION
604 must be an lvalue. */
606 #define STORE_NUMBER_AND_INCR(destination, number) \
608 STORE_NUMBER (destination, number); \
609 (destination) += 2; \
612 /* Put into DESTINATION a number stored in two contiguous bytes starting
615 #define EXTRACT_NUMBER(destination, source) \
617 (destination) = *(source) & 0377; \
618 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
621 #ifdef REGEX_DEBUG_FLAG
622 static void extract_number(int *dest, re_char * source)
624 int temp = SIGN_EXTEND_CHAR(*(source + 1));
625 *dest = *source & 0377;
629 #ifndef EXTRACT_MACROS /* To debug the macros. */
630 #undef EXTRACT_NUMBER
631 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
632 #endif /* not EXTRACT_MACROS */
634 #endif /* REGEX_DEBUG_FLAG */
636 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
637 SOURCE must be an lvalue. */
639 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
641 EXTRACT_NUMBER (destination, source); \
645 #ifdef REGEX_DEBUG_FLAG
647 extract_number_and_incr(int *destination, unsigned char **source)
649 extract_number(destination, *source);
653 #ifndef EXTRACT_MACROS
654 #undef EXTRACT_NUMBER_AND_INCR
655 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
656 extract_number_and_incr (&dest, &src)
657 #endif /* not EXTRACT_MACROS */
659 #endif /* REGEX_DEBUG_FLAG */
661 /* If REGEX_DEBUG_FLAG is defined, Regex prints many voluminous messages about
662 what it is doing (if the variable `debug' is nonzero). If linked with the
663 main program in `iregex.c', you can enter patterns and strings interactively.
664 And if linked with the main program in `main.c' and the other test files, you
665 can run the already-written tests. */
667 #if defined (REGEX_DEBUG_FLAG)
669 /* We use standard I/O for debugging. */
673 /* XEmacs provides its own version of assert() */
674 /* It is useful to test things that ``must'' be true when debugging. */
679 /* no point in having this one, since we do not offer any accessor to this */
680 static int debug = 0;
683 #define DEBUG_STATEMENT(e) e
684 #define DEBUG_PRINT1(x) printf(x)
685 #define DEBUG_PRINT2(x1, x2) printf(x1, x2)
686 #define DEBUG_PRINT3(x1, x2, x3) printf(x1, x2, x3)
687 #define DEBUG_PRINT4(x1, x2, x3, x4) printf(x1, x2, x3, x4)
688 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
689 print_partial_compiled_pattern(s, e)
690 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
691 print_double_string (w, s1, sz1, s2, sz2)
693 /* Print the fastmap in human-readable form. */
695 static void print_fastmap(char *fastmap)
697 unsigned was_a_range = 0;
700 while (i < (1 << BYTEWIDTH)) {
704 while (i < (1 << BYTEWIDTH) && fastmap[i]) {
717 /* Print a compiled pattern string in human-readable form, starting at
718 the START pointer into it and ending just before the pointer END. */
720 /* can't help it ... the ugly mule code changes `start' upon alignment,
721 * so this can't have the const qualifier ... bugger */
723 print_partial_compiled_pattern(unsigned char *start, re_char *end)
726 unsigned char *p = start;
734 /* Loop over pattern commands. */
736 printf("%ld:\t", (long)(p - start));
738 switch ((const re_opcode_t)*p++) {
745 printf("/exactn/%d", mcnt);
755 printf("/start_memory/%d/%d", mcnt, *p++);
760 printf("/stop_memory/%d/%d", mcnt, *p++);
764 printf("/duplicate/%d", *p++);
773 REGISTER int c, last = -100;
774 REGISTER int in_range = 0;
776 printf("/charset [%s",
777 (re_opcode_t) * (p - 1) == charset_not
780 assert(p + *p < pend);
782 for (c = 0; c < 256; c++)
783 if (((unsigned char)(c / 8) < *p)
786 /* Are we starting a range? */
787 if (last + 1 == c && !in_range) {
791 /* Have we broken a range? */
792 else if (last + 1 != c
815 case charset_mule_not: {
818 printf("/charset_mule [%s",
819 (re_opcode_t) * (p - 1) == charset_mule_not
821 /* u_r_t_nentries() CHANGES its arg, why on earth
822 * is this marked const here then? :O -hrop */
823 nentries = unified_range_table_nentries(p);
824 for (i = 0; i < nentries; i++) {
825 EMACS_INT first, last;
826 Lisp_Object dummy_val;
828 /* u_r_t_get_range CHANGES its arg ...
829 * p can't be const thence */
830 unified_range_table_get_range(p, i,
837 printf("(0x%lx)", (long)first);
848 p += unified_range_table_bytes_used(p);
861 case on_failure_jump:
862 extract_number_and_incr(&mcnt, &p);
863 printf("/on_failure_jump to %ld",
864 (long)(p + mcnt - start));
867 case on_failure_keep_string_jump:
868 extract_number_and_incr(&mcnt, &p);
869 printf("/on_failure_keep_string_jump to %ld",
870 (long)(p + mcnt - start));
873 case dummy_failure_jump:
874 extract_number_and_incr(&mcnt, &p);
875 printf("/dummy_failure_jump to %ld",
876 (long)(p + mcnt - start));
879 case push_dummy_failure:
880 printf("/push_dummy_failure");
884 extract_number_and_incr(&mcnt, &p);
885 printf("/maybe_pop_jump to %ld",
886 (long)(p + mcnt - start));
889 case pop_failure_jump:
890 extract_number_and_incr(&mcnt, &p);
891 printf("/pop_failure_jump to %ld",
892 (long)(p + mcnt - start));
896 extract_number_and_incr(&mcnt, &p);
897 printf("/jump_past_alt to %ld",
898 (long)(p + mcnt - start));
902 extract_number_and_incr(&mcnt, &p);
903 printf("/jump to %ld", (long)(p + mcnt - start));
907 extract_number_and_incr(&mcnt, &p);
908 extract_number_and_incr(&mcnt2, &p);
909 printf("/succeed_n to %ld, %d times",
910 (long)(p + mcnt - start), mcnt2);
914 extract_number_and_incr(&mcnt, &p);
915 extract_number_and_incr(&mcnt2, &p);
916 printf("/jump_n to %ld, %d times",
917 (long)(p + mcnt - start), mcnt2);
921 extract_number_and_incr(&mcnt, &p);
922 extract_number_and_incr(&mcnt2, &p);
923 printf("/set_number_at location %ld to %d",
924 (long)(p + mcnt - start), mcnt2);
928 printf("/wordbound");
932 printf("/notwordbound");
944 printf("/before_dot");
952 printf("/after_dot");
956 printf("/syntaxspec");
962 printf("/notsyntaxspec");
968 /* 97/2/17 jhod Mule category patch */
970 printf("/categoryspec");
975 case notcategoryspec:
976 printf("/notcategoryspec");
980 /* end of category patch */
989 printf("/notwordchar");
1002 printf("?%d", *(p - 1));
1008 printf("%ld:\tend of pattern.\n", (long)(p - start));
1012 print_compiled_pattern(struct re_pattern_buffer *bufp)
1016 re_char *buffer = bufp->buffer;
1018 unsigned char *buffer = bufp->buffer;
1021 print_partial_compiled_pattern(buffer, buffer + bufp->used);
1022 printf("%ld bytes used/%ld bytes allocated.\n", bufp->used,
1025 if (bufp->fastmap_accurate && bufp->fastmap) {
1026 printf("fastmap: ");
1027 print_fastmap(bufp->fastmap);
1030 printf("re_nsub: %ld\t", (long)bufp->re_nsub);
1031 printf("re_ngroups: %ld\t", (long)bufp->re_ngroups);
1032 printf("regs_alloc: %d\t", bufp->regs_allocated);
1033 printf("can_be_null: %d\t", bufp->can_be_null);
1034 printf("newline_anchor: %d\n", bufp->newline_anchor);
1035 printf("no_sub: %d\t", bufp->no_sub);
1036 printf("not_bol: %d\t", bufp->not_bol);
1037 printf("not_eol: %d\t", bufp->not_eol);
1038 printf("syntax: %d\n", bufp->syntax);
1039 /* Perhaps we should print the translate table? */
1040 /* and maybe the category table? */
1042 if (bufp->external_to_internal_register) {
1045 printf("external_to_internal_register:\n");
1046 for (i = 0; i <= bufp->re_nsub; i++) {
1049 printf("%d -> %d", i,
1050 bufp->external_to_internal_register[i]);
1057 print_double_string(re_char * where, re_char * string1, int size1,
1058 re_char * string2, int size2)
1063 Element_count this_char;
1065 if (FIRST_STRING_P(where)) {
1066 for (this_char = where - string1; this_char < size1;
1068 putchar(string1[this_char]);
1073 for (this_char = where - string2; this_char < size2;
1075 putchar(string2[this_char]);
1079 #else /* not REGEX_DEBUG_FLAG */
1084 #define DEBUG_STATEMENT(e)
1085 #define DEBUG_PRINT1(x)
1086 #define DEBUG_PRINT2(x1, x2)
1087 #define DEBUG_PRINT3(x1, x2, x3)
1088 #define DEBUG_PRINT4(x1, x2, x3, x4)
1089 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1090 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1092 #endif /* REGEX_DEBUG_FLAG */
1094 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1095 also be assigned to arbitrarily: each pattern buffer stores its own
1096 syntax, so it can be changed between regex compilations. */
1097 /* This has no initializer because initialized variables in Emacs
1098 become read-only after dumping. */
1099 reg_syntax_t re_syntax_options;
1101 /* Specify the precise syntax of regexps for compilation. This provides
1102 for compatibility for various utilities which historically have
1103 different, incompatible syntaxes.
1105 The argument SYNTAX is a bit mask comprised of the various bits
1106 defined in regex.h. We return the old syntax. */
1108 reg_syntax_t re_set_syntax(reg_syntax_t syntax)
1110 reg_syntax_t ret = re_syntax_options;
1112 re_syntax_options = syntax;
1116 /* This table gives an error message for each of the error codes listed
1117 in regex.h. Obviously the order here has to be same as there.
1118 POSIX doesn't require that we do anything for REG_NOERROR,
1119 but why not be nice? */
1121 static const char *re_error_msgid[] = {
1122 "Success", /* REG_NOERROR */
1123 "No match", /* REG_NOMATCH */
1124 "Invalid regular expression", /* REG_BADPAT */
1125 "Invalid collation character", /* REG_ECOLLATE */
1126 "Invalid character class name", /* REG_ECTYPE */
1127 "Trailing backslash", /* REG_EESCAPE */
1128 "Invalid back reference", /* REG_ESUBREG */
1129 "Unmatched [ or [^", /* REG_EBRACK */
1130 "Unmatched ( or \\(", /* REG_EPAREN */
1131 "Unmatched \\{", /* REG_EBRACE */
1132 "Invalid content of \\{\\}", /* REG_BADBR */
1133 "Invalid range end", /* REG_ERANGE */
1134 "Memory exhausted", /* REG_ESPACE */
1135 "Invalid preceding regular expression", /* REG_BADRPT */
1136 "Premature end of regular expression", /* REG_EEND */
1137 "Regular expression too big", /* REG_ESIZE */
1138 "Unmatched ) or \\)", /* REG_ERPAREN */
1140 "Invalid syntax designator", /* REG_ESYNTAX */
1143 "Ranges may not span charsets", /* REG_ERANGESPAN */
1144 "Invalid category designator", /* REG_ECATEGORY */
1148 /* Avoiding alloca during matching, to placate r_alloc. */
1150 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1151 searching and matching functions should not call alloca. On some
1152 systems, alloca is implemented in terms of malloc, and if we're
1153 using the relocating allocator routines, then malloc could cause a
1154 relocation, which might (if the strings being searched are in the
1155 ralloc heap) shift the data out from underneath the regexp
1158 Here's another reason to avoid allocation: Emacs
1159 processes input from X in a signal handler; processing X input may
1160 call malloc; if input arrives while a matching routine is calling
1161 malloc, then we're scrod. But Emacs can't just block input while
1162 calling matching routines; then we don't notice interrupts when
1163 they come in. So, Emacs blocks input around all regexp calls
1164 except the matching calls, which it leaves unprotected, in the
1165 faith that they will not malloc. */
1167 /* Normally, this is fine. */
1168 #define MATCH_MAY_ALLOCATE
1170 /* When using GNU C, we are not REALLY using the C alloca, no matter
1171 what config.h may say. So don't take precautions for it. */
1176 /* The match routines may not allocate if (1) they would do it with malloc
1177 and (2) it's not safe for them to use malloc.
1178 Note that if REL_ALLOC is defined, matching would not use malloc for the
1179 failure stack, but we would still use it for the register vectors;
1180 so REL_ALLOC should not affect this. */
1181 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1182 #undef MATCH_MAY_ALLOCATE
1185 /* Failure stack declarations and macros; both re_compile_fastmap and
1186 re_match_2 use a failure stack. These have to be macros because of
1187 REGEX_ALLOCATE_STACK. */
1189 /* Number of failure points for which to initially allocate space
1190 when matching. If this number is exceeded, we allocate more
1191 space, so it is not a hard limit. */
1192 #ifndef INIT_FAILURE_ALLOC
1193 #define INIT_FAILURE_ALLOC 20
1196 /* Roughly the maximum number of failure points on the stack. Would be
1197 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1198 This is a variable only so users of regex can assign to it; we never
1199 change it ourselves. */
1200 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1201 /* 4400 was enough to cause a crash on Alpha OSF/1,
1202 whose default stack limit is 2mb. */
1203 int re_max_failures = 50000;
1205 int re_max_failures = 2000;
1208 union fail_stack_elt {
1209 const unsigned char *pointer;
1213 typedef union fail_stack_elt fail_stack_elt_t;
1216 fail_stack_elt_t *stack;
1218 Element_count avail; /* Offset of next open position. */
1221 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1222 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1223 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1225 /* Define macros to initialize and free the failure stack.
1226 Do `return -2' if the alloc fails. */
1228 #ifdef MATCH_MAY_ALLOCATE
1229 #define INIT_FAIL_STACK() \
1231 fail_stack.stack = (fail_stack_elt_t *) \
1232 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \
1233 sizeof (fail_stack_elt_t)); \
1235 if (fail_stack.stack == NULL) \
1238 fail_stack.size = INIT_FAILURE_ALLOC; \
1239 fail_stack.avail = 0; \
1242 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1244 #define INIT_FAIL_STACK() \
1246 fail_stack.avail = 0; \
1249 #define RESET_FAIL_STACK()
1252 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1254 Return 1 if succeeds, and 0 if either ran out of memory
1255 allocating space for it or it was already too large.
1257 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1259 #define DOUBLE_FAIL_STACK(fail_stack) \
1260 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1262 : ((fail_stack).stack = (fail_stack_elt_t *) \
1263 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1264 (fail_stack).size * sizeof (fail_stack_elt_t), \
1265 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1267 (fail_stack).stack == NULL \
1269 : ((fail_stack).size <<= 1, \
1272 /* Push pointer POINTER on FAIL_STACK.
1273 Return 1 if was able to do so and 0 if ran out of memory allocating
1275 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1276 ((FAIL_STACK_FULL () \
1277 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1279 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1282 /* Push a pointer value onto the failure stack.
1283 Assumes the variable `fail_stack'. Probably should only
1284 be called from within `PUSH_FAILURE_POINT'. */
1285 #define PUSH_FAILURE_POINTER(item) \
1286 fail_stack.stack[fail_stack.avail++].pointer = \
1287 (const unsigned char*)(item)
1289 /* This pushes an integer-valued item onto the failure stack.
1290 Assumes the variable `fail_stack'. Probably should only
1291 be called from within `PUSH_FAILURE_POINT'. */
1292 #define PUSH_FAILURE_INT(item) \
1293 fail_stack.stack[fail_stack.avail++].integer = (item)
1295 /* Push a fail_stack_elt_t value onto the failure stack.
1296 Assumes the variable `fail_stack'. Probably should only
1297 be called from within `PUSH_FAILURE_POINT'. */
1298 #define PUSH_FAILURE_ELT(item) \
1299 fail_stack.stack[fail_stack.avail++] = (item)
1301 /* These three POP... operations complement the three PUSH... operations.
1302 All assume that `fail_stack' is nonempty. */
1303 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1304 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1305 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1307 /* Used to omit pushing failure point id's when we're not debugging. */
1308 #ifdef REGEX_DEBUG_FLAG
1309 #define DEBUG_PUSH PUSH_FAILURE_INT
1310 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1312 #define DEBUG_PUSH(item)
1313 #define DEBUG_POP(item_addr)
1316 /* Push the information about the state we will need
1317 if we ever fail back to it.
1319 Requires variables fail_stack, regstart, regend, reg_info, and
1320 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1323 Does `return FAILURE_CODE' if runs out of memory. */
1325 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1326 #define DECLARE_DESTINATION char *destination
1328 #define DECLARE_DESTINATION DECLARE_NOTHING
1331 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1333 DECLARE_DESTINATION; \
1334 /* Must be int, so when we don't save any registers, the\
1335 arithmetic \ of 0 + -1 isn't done as unsigned. */ \
1338 DEBUG_STATEMENT (failure_id++); \
1339 DEBUG_STATEMENT (nfailure_points_pushed++); \
1340 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1341 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1342 (long unsigned int)(fail_stack).avail); \
1343 DEBUG_PRINT2 (" size: %lu\n", \
1344 (long unsigned int)(fail_stack).size); \
1346 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1347 DEBUG_PRINT2 (" available: %ld\n", \
1348 (long) REMAINING_AVAIL_SLOTS); \
1350 /* Ensure we have enough space allocated for what \
1351 * we will push. */ \
1352 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) { \
1353 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1354 return failure_code; \
1356 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1357 (unsigned long) (fail_stack).size); \
1358 DEBUG_PRINT2 (" slots available: %ld\n", \
1359 (long) REMAINING_AVAIL_SLOTS); \
1362 /* Push the info, starting with the registers. */ \
1363 DEBUG_PRINT1 ("\n"); \
1365 for (this_reg = lowest_active_reg; \
1366 this_reg <= highest_active_reg; \
1368 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1369 DEBUG_STATEMENT (num_regs_pushed++); \
1371 DEBUG_PRINT2(" start: %p\n", \
1372 regstart[this_reg]); \
1373 PUSH_FAILURE_POINTER(regstart[this_reg]); \
1375 DEBUG_PRINT2 (" end: %p\n", \
1376 regend[this_reg]); \
1377 PUSH_FAILURE_POINTER(regend[this_reg]); \
1379 DEBUG_PRINT2 (" info: 0x%p\n ", \
1380 ®_info[this_reg]); \
1381 DEBUG_PRINT2 (" match_null=%d", \
1382 REG_MATCH_NULL_STRING_P \
1383 (reg_info[this_reg])); \
1384 DEBUG_PRINT2 (" active=%d", \
1385 IS_ACTIVE (reg_info[this_reg])); \
1386 DEBUG_PRINT2 (" matched_something=%d", \
1388 (reg_info[this_reg])); \
1389 DEBUG_PRINT2 (" ever_matched_something=%d", \
1390 EVER_MATCHED_SOMETHING \
1391 (reg_info[this_reg])); \
1392 DEBUG_PRINT1 ("\n"); \
1393 PUSH_FAILURE_ELT(reg_info[this_reg].word); \
1396 DEBUG_PRINT2 (" Pushing low active reg: %d\n", \
1397 lowest_active_reg); \
1398 PUSH_FAILURE_INT (lowest_active_reg); \
1400 DEBUG_PRINT2 (" Pushing high active reg: %d\n", \
1401 highest_active_reg); \
1402 PUSH_FAILURE_INT (highest_active_reg); \
1404 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", \
1405 (long) pattern_place); \
1406 DEBUG_PRINT_COMPILED_PATTERN(bufp, pattern_place, pend); \
1407 PUSH_FAILURE_POINTER (pattern_place); \
1409 DEBUG_PRINT2(" Pushing string %p: `", \
1411 DEBUG_PRINT_DOUBLE_STRING(string_place, \
1414 DEBUG_PRINT1("'\n"); \
1415 PUSH_FAILURE_POINTER (string_place); \
1417 DEBUG_PRINT2(" Pushing failure id: %u\n", failure_id); \
1418 DEBUG_PUSH(failure_id); \
1421 /* This is the number of items that are pushed and popped on the stack
1422 for each register. */
1423 #define NUM_REG_ITEMS 3
1425 /* Individual items aside from the registers. */
1427 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1429 #define NUM_NONREG_ITEMS 4
1432 /* We push at most this many items on the stack. */
1433 /* We used to use (num_regs - 1), which is the number of registers
1434 this regexp will save; but that was changed to 5
1435 to avoid stack overflow for a regexp with lots of parens. */
1436 #define MAX_FAILURE_ITEMS (5 * (NUM_REG_ITEMS + NUM_NONREG_ITEMS))
1438 /* We actually push this many items. */
1439 #define NUM_FAILURE_ITEMS \
1440 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1443 /* How many items can still be added to the stack without overflowing it. */
1444 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1446 /* Pops what PUSH_FAIL_STACK pushes.
1448 We restore into the parameters, all of which should be lvalues:
1449 STR -- the saved data position.
1450 PAT -- the saved pattern position.
1451 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1452 REGSTART, REGEND -- arrays of string positions.
1453 REG_INFO -- array of information about each subexpression.
1455 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1456 `pend', `string1', `size1', `string2', and `size2'. */
1458 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1459 regstart, regend, reg_info) \
1461 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1463 const unsigned char *string_temp; \
1465 assert (!FAIL_STACK_EMPTY ()); \
1467 /* Remove failure points and point to \
1468 * how many regs pushed. */ \
1469 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1470 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1471 (unsigned long) fail_stack.avail); \
1472 DEBUG_PRINT2 (" size: %lu\n", \
1473 (unsigned long) fail_stack.size); \
1475 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1477 DEBUG_POP (&ffailure_id.integer); \
1478 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1479 * (unsigned int *) &ffailure_id); \
1481 /* If the saved string location is NULL, it \
1482 came from an on_failure_keep_string_jump opcode, \
1483 and we want to throw away the saved NULL, thus \
1484 retaining our current position in the string. */ \
1485 string_temp = POP_FAILURE_POINTER (); \
1486 if (string_temp != NULL) \
1487 str = string_temp; \
1489 DEBUG_PRINT2(" Popping string %p: `", str); \
1490 DEBUG_PRINT_DOUBLE_STRING(str, \
1493 DEBUG_PRINT1("'\n"); \
1495 pat = (unsigned char *) POP_FAILURE_POINTER(); \
1496 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1497 DEBUG_PRINT_COMPILED_PATTERN(bufp, pat, pend); \
1499 /* Restore register info. */ \
1500 high_reg = (unsigned) POP_FAILURE_INT (); \
1501 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1503 low_reg = (unsigned) POP_FAILURE_INT (); \
1504 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1506 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) { \
1507 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1509 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1510 DEBUG_PRINT2 (" info: %p\n", \
1511 ®_info[this_reg]); \
1513 regend[this_reg] = POP_FAILURE_POINTER (); \
1514 DEBUG_PRINT2 (" end: %p\n", \
1515 regend[this_reg]); \
1517 regstart[this_reg] = POP_FAILURE_POINTER (); \
1518 DEBUG_PRINT2 (" start: %p\n", \
1519 regstart[this_reg]); \
1522 set_regs_matched_done = 0; \
1523 DEBUG_STATEMENT (nfailure_points_popped++); \
1524 } while (0) /* POP_FAILURE_POINT */
1526 /* Structure for per-register (a.k.a. per-group) information.
1527 Other register information, such as the
1528 starting and ending positions (which are addresses), and the list of
1529 inner groups (which is a bits list) are maintained in separate
1532 We are making a (strictly speaking) nonportable assumption here: that
1533 the compiler will pack our bit fields into something that fits into
1534 the type of `word', i.e., is something that fits into one item on the
1538 fail_stack_elt_t word;
1540 /* This field is one if this group can match the empty string,
1541 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1542 #define MATCH_NULL_UNSET_VALUE 3
1543 unsigned match_null_string_p:2;
1544 unsigned is_active:1;
1545 unsigned matched_something:1;
1546 unsigned ever_matched_something:1;
1548 } register_info_type;
1550 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1551 #define IS_ACTIVE(R) ((R).bits.is_active)
1552 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1553 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1555 /* Call this when have matched a real character; it sets `matched' flags
1556 for the subexpressions which we are currently inside. Also records
1557 that those subexprs have matched. */
1558 #define SET_REGS_MATCHED() \
1561 if (!set_regs_matched_done) \
1564 set_regs_matched_done = 1; \
1565 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1567 MATCHED_SOMETHING (reg_info[r]) \
1568 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1575 /* Registers are set to a sentinel when they haven't yet matched. */
1576 static unsigned char reg_unset_dummy;
1577 #define REG_UNSET_VALUE (®_unset_dummy)
1578 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1580 /* Subroutine declarations and macros for regex_compile. */
1582 /* Fetch the next character in the uncompiled pattern---translating it
1583 if necessary. Also cast from a signed character in the constant
1584 string passed to us by the user to an unsigned char that we can use
1585 as an array index (in, e.g., `translate'). */
1586 #define PATFETCH(c) \
1589 c = TRANSLATE (c); \
1592 /* Fetch the next character in the uncompiled pattern, with no
1594 #define PATFETCH_RAW(c) \
1595 do {if (p == pend) return REG_EEND; \
1596 assert (p < pend); \
1597 c = charptr_emchar (p); \
1601 /* Go backwards one character in the pattern. */
1602 #define PATUNFETCH DEC_CHARPTR (p)
1606 #define PATFETCH_EXTENDED(emch) \
1607 do {if (p == pend) return REG_EEND; \
1608 assert (p < pend); \
1609 emch = charptr_emchar ((const Bufbyte *) p); \
1611 if (TRANSLATE_P (translate) && emch < 0x80) \
1612 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1615 #define PATFETCH_RAW_EXTENDED(emch) \
1616 do {if (p == pend) return REG_EEND; \
1617 assert (p < pend); \
1618 emch = charptr_emchar ((const Bufbyte *) p); \
1622 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1624 #define PATFETCH_EITHER(emch) \
1626 if (has_extended_chars) \
1627 PATFETCH_EXTENDED (emch); \
1632 #define PATFETCH_RAW_EITHER(emch) \
1634 if (has_extended_chars) \
1635 PATFETCH_RAW_EXTENDED (emch); \
1637 PATFETCH_RAW (emch); \
1640 #define PATUNFETCH_EITHER \
1642 if (has_extended_chars) \
1643 PATUNFETCH_EXTENDED (emch); \
1645 PATUNFETCH (emch); \
1648 #else /* not MULE */
1650 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1651 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1652 #define PATUNFETCH_EITHER PATUNFETCH
1656 /* If `translate' is non-null, return translate[D], else just D. We
1657 cast the subscript to translate because some data is declared as
1658 `char *', to avoid warnings when a string constant is passed. But
1659 when we use a character as a subscript we must make it unsigned. */
1660 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1662 /* Macros for outputting the compiled pattern into `buffer'. */
1664 /* If the buffer isn't allocated when it comes in, use this. */
1665 #define INIT_BUF_SIZE 32
1667 /* Make sure we have at least N more bytes of space in buffer. */
1668 #define GET_BUFFER_SPACE(n) \
1669 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1672 /* Make sure we have one more byte of buffer space and then add C to it. */
1673 #define BUF_PUSH(c) \
1675 GET_BUFFER_SPACE (1); \
1676 *buf_end++ = (unsigned char) (c); \
1679 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1680 #define BUF_PUSH_2(c1, c2) \
1682 GET_BUFFER_SPACE (2); \
1683 *buf_end++ = (unsigned char) (c1); \
1684 *buf_end++ = (unsigned char) (c2); \
1687 /* As with BUF_PUSH_2, except for three bytes. */
1688 #define BUF_PUSH_3(c1, c2, c3) \
1690 GET_BUFFER_SPACE (3); \
1691 *buf_end++ = (unsigned char) (c1); \
1692 *buf_end++ = (unsigned char) (c2); \
1693 *buf_end++ = (unsigned char) (c3); \
1696 /* Store a jump with opcode OP at LOC to location TO. We store a
1697 relative address offset by the three bytes the jump itself occupies. */
1698 #define STORE_JUMP(op, loc, to) \
1699 store_op1 (op, loc, (to) - (loc) - 3)
1701 /* Likewise, for a two-argument jump. */
1702 #define STORE_JUMP2(op, loc, to, arg) \
1703 store_op2 (op, loc, (to) - (loc) - 3, arg)
1705 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1707 #define INSERT_JUMP(op, loc, to) \
1708 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1710 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1712 #define INSERT_JUMP2(op, loc, to, arg) \
1713 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1715 /* This is not an arbitrary limit: the arguments which represent offsets
1716 into the pattern are two bytes long. So if 2^16 bytes turns out to
1717 be too small, many things would have to change. */
1718 #define MAX_BUF_SIZE (1L << 16)
1720 /* Extend the buffer by twice its current size via realloc and
1721 reset the pointers that pointed into the old block to point to the
1722 correct places in the new one. If extending the buffer results in it
1723 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1724 #define EXTEND_BUFFER() \
1726 re_char *old_buffer = bufp->buffer; \
1727 if (bufp->allocated == MAX_BUF_SIZE) \
1729 bufp->allocated <<= 1; \
1730 if (bufp->allocated > MAX_BUF_SIZE) \
1731 bufp->allocated = MAX_BUF_SIZE; \
1732 bufp->buffer = (unsigned char *)xrealloc( \
1733 bufp->buffer, bufp->allocated); \
1734 if (bufp->buffer == NULL) { \
1735 return REG_ESPACE; \
1737 /* If the buffer moved, move all the pointers into it. */ \
1738 if (old_buffer != bufp->buffer) { \
1739 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1740 begalt = (begalt - old_buffer) + bufp->buffer; \
1741 if (fixup_alt_jump) { \
1743 (fixup_alt_jump - old_buffer) + \
1747 laststart = (laststart - old_buffer) + \
1750 if (pending_exact) { \
1752 (pending_exact - old_buffer) + \
1758 /* Since we have one byte reserved for the register number argument to
1759 {start,stop}_memory, the maximum number of groups we can report
1760 things about is what fits in that byte. */
1761 #define MAX_REGNUM 255
1763 /* But patterns can have more than `MAX_REGNUM' registers. We just
1764 ignore the excess. */
1765 typedef unsigned regnum_t;
1767 #define INIT_REG_TRANSLATE_SIZE 5
1769 /* Macros for the compile stack. */
1771 /* Since offsets can go either forwards or backwards, this type needs to
1772 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1773 typedef int pattern_offset_t;
1776 pattern_offset_t begalt_offset;
1777 pattern_offset_t fixup_alt_jump;
1778 pattern_offset_t inner_group_offset;
1779 pattern_offset_t laststart_offset;
1781 } compile_stack_elt_t;
1784 compile_stack_elt_t *stack;
1786 unsigned avail; /* Offset of next open position. */
1787 } compile_stack_type;
1789 #define INIT_COMPILE_STACK_SIZE 32
1791 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1792 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1794 /* The next available element. */
1795 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1797 /* Set the bit for character C in a bit vector. */
1798 #define SET_LIST_BIT(c) \
1799 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1800 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1804 /* Set the "bit" for character C in a range table. */
1805 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1807 /* Set the "bit" for character c in the appropriate table. */
1808 #define SET_EITHER_BIT(c) \
1810 if (has_extended_chars) \
1811 SET_RANGETAB_BIT (c); \
1816 #else /* not MULE */
1818 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1822 /* Get the next unsigned number in the uncompiled pattern. */
1823 #define GET_UNSIGNED_NUMBER(num) \
1827 while (ISDIGIT (c)) \
1831 num = num * 10 + c - '0'; \
1839 #define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
1842 static void store_op1(re_opcode_t op, unsigned char *loc, int arg);
1843 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1844 static void insert_op1(re_opcode_t op, unsigned char *loc, int arg,
1845 unsigned char *end);
1846 static void insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1847 unsigned char *end);
1848 static re_bool at_begline_loc_p(re_char*, re_char*, reg_syntax_t);
1849 static re_bool at_endline_loc_p(re_char*, re_char*, int syntax);
1850 static re_bool group_in_compile_stack(compile_stack_type compile_stack,
1852 static reg_errcode_t compile_range(re_char ** p_ptr, re_char * pend,
1853 RE_TRANSLATE_TYPE translate,
1854 reg_syntax_t syntax, unsigned char *b);
1856 static reg_errcode_t compile_extended_range(re_char ** p_ptr,
1858 RE_TRANSLATE_TYPE translate,
1859 reg_syntax_t syntax,
1862 static re_bool group_match_null_string_p(unsigned char **p,
1864 register_info_type * reg_info);
1865 static re_bool alt_match_null_string_p(unsigned char *p, unsigned char *end,
1866 register_info_type * reg_info);
1867 static re_bool common_op_match_null_string_p(unsigned char **p,
1869 register_info_type * reg_info);
1870 static int bcmp_translate(const unsigned char *s1, const unsigned char *s2,
1871 REGISTER int len, RE_TRANSLATE_TYPE translate);
1872 static int re_match_2_internal(struct re_pattern_buffer *bufp,
1873 re_char * string1, int size1,
1874 re_char * string2, int size2, int pos,
1875 struct re_registers *regs, int stop);
1877 #ifndef MATCH_MAY_ALLOCATE
1879 /* If we cannot allocate large objects within re_match_2_internal,
1880 we make the fail stack and register vectors global.
1881 The fail stack, we grow to the maximum size when a regexp
1883 The register vectors, we adjust in size each time we
1884 compile a regexp, according to the number of registers it needs. */
1886 static fail_stack_type fail_stack;
1888 /* Size with which the following vectors are currently allocated.
1889 That is so we can make them bigger as needed,
1890 but never make them smaller. */
1891 static int regs_allocated_size;
1893 static re_char **regstart, **regend;
1894 static re_char **old_regstart, **old_regend;
1895 static re_char **best_regstart, **best_regend;
1896 static register_info_type *reg_info;
1897 static re_char **reg_dummy;
1898 static register_info_type *reg_info_dummy;
1900 /* Make the register vectors big enough for NUM_REGS registers,
1901 but don't make them smaller. */
1903 static void regex_grow_registers(int num_regs)
1905 if (num_regs > regs_allocated_size) {
1906 RETALLOC_IF(regstart, num_regs, re_char *);
1907 RETALLOC_IF(regend, num_regs, re_char *);
1908 RETALLOC_IF(old_regstart, num_regs, re_char *);
1909 RETALLOC_IF(old_regend, num_regs, re_char *);
1910 RETALLOC_IF(best_regstart, num_regs, re_char *);
1911 RETALLOC_IF(best_regend, num_regs, re_char *);
1912 RETALLOC_IF(reg_info, num_regs, register_info_type);
1913 RETALLOC_IF(reg_dummy, num_regs, re_char *);
1914 RETALLOC_IF(reg_info_dummy, num_regs, register_info_type);
1916 regs_allocated_size = num_regs;
1920 #endif /* not MATCH_MAY_ALLOCATE */
1922 /* auxiliary stuff, FSF theft */
1923 /* Character classes. */
1926 RECC_ALNUM, RECC_ALPHA, RECC_WORD,
1927 RECC_GRAPH, RECC_PRINT,
1928 RECC_LOWER, RECC_UPPER,
1929 RECC_PUNCT, RECC_CNTRL,
1930 RECC_DIGIT, RECC_XDIGIT,
1931 RECC_BLANK, RECC_SPACE,
1932 RECC_MULTIBYTE, RECC_NONASCII,
1933 RECC_ASCII, RECC_UNIBYTE
1936 /* Map a string to the char class it names (if any). */
1937 static inline re_wctype_t
1938 re_wctype(re_char *str)
1940 const char *string = (const char*)str;
1942 if (STREQ (string, "alnum")) {
1944 } else if (STREQ (string, "alpha")) {
1946 } else if (STREQ (string, "digit")) {
1948 } else if (STREQ (string, "graph")) {
1950 } else if (STREQ (string, "space")) {
1952 } else if (STREQ (string, "upper")) {
1954 } else if (STREQ (string, "lower")) {
1956 } else if (STREQ (string, "word")) {
1958 } else if (STREQ (string, "print")) {
1960 } else if (STREQ (string, "punct")) {
1962 } else if (STREQ (string, "xdigit")) {
1964 } else if (STREQ (string, "cntrl")) {
1966 } else if (STREQ (string, "blank")) {
1968 } else if (STREQ (string, "ascii")) {
1970 } else if (STREQ (string, "nonascii")) {
1971 return RECC_NONASCII;
1972 } else if (STREQ (string, "unibyte")) {
1973 return RECC_UNIBYTE;
1974 } else if (STREQ (string, "multibyte")) {
1975 return RECC_MULTIBYTE;
1981 /* True if CH is in the char class CC. */
1983 re_iswctype(int ch, re_wctype_t cc)
1987 return ISALNUM (ch);
1989 return ISALPHA (ch);
1991 return ISBLANK (ch);
1993 return ISCNTRL (ch);
1995 return ISDIGIT (ch);
1997 return ISGRAPH (ch);
1999 return ISLOWER (ch);
2001 return ISPRINT (ch);
2003 return ISPUNCT (ch);
2005 return ISSPACE (ch);
2007 return ISUPPER (ch);
2009 return ISXDIGIT (ch);
2011 return IS_REAL_ASCII (ch);
2013 return !IS_REAL_ASCII (ch);
2017 return ISUNIBYTE((unsigned int)ch);
2018 case RECC_MULTIBYTE:
2019 return !ISUNIBYTE((unsigned int)ch);
2029 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2030 Returns one of error codes defined in `regex.h', or zero for success.
2032 Assumes the `allocated' (and perhaps `buffer') and `translate'
2033 fields are set in BUFP on entry.
2035 If it succeeds, results are put in BUFP (if it returns an error, the
2036 contents of BUFP are undefined):
2037 `buffer' is the compiled pattern;
2038 `syntax' is set to SYNTAX;
2039 `used' is set to the length of the compiled pattern;
2040 `fastmap_accurate' is zero;
2041 `re_ngroups' is the number of groups/subexpressions (including shy
2043 `re_nsub' is the number of non-shy groups in PATTERN;
2044 `not_bol' and `not_eol' are zero;
2046 The `fastmap' and `newline_anchor' fields are neither
2047 examined nor set. */
2049 static reg_errcode_t
2050 regex_compile(re_char * pattern, int size, reg_syntax_t syntax,
2051 struct re_pattern_buffer *bufp)
2053 /* We fetch characters from PATTERN here. We declare these as int
2054 (or possibly long) so that chars above 127 can be used as
2055 array indices. The macros that fetch a character from the pattern
2056 make sure to coerce to unsigned char before assigning, so we won't
2057 get bitten by negative numbers here. */
2058 /* XEmacs change: used to be unsigned char. */
2059 REGISTER EMACS_INT c, c1;
2061 /* A random temporary spot in PATTERN. */
2064 /* Points to the end of the buffer, where we should append. */
2065 REGISTER unsigned char *buf_end;
2067 /* Keeps track of unclosed groups. */
2068 compile_stack_type compile_stack;
2070 /* Points to the current (ending) position in the pattern. */
2071 re_char *p = pattern;
2072 re_char *pend = pattern + size;
2074 /* How to translate the characters in the pattern. */
2075 RE_TRANSLATE_TYPE translate = bufp->translate;
2077 /* Address of the count-byte of the most recently inserted `exactn'
2078 command. This makes it possible to tell if a new exact-match
2079 character can be added to that command or if the character requires
2080 a new `exactn' command. */
2081 unsigned char *pending_exact = 0;
2083 /* Address of start of the most recently finished expression.
2084 This tells, e.g., postfix * where to find the start of its
2085 operand. Reset at the beginning of groups and alternatives. */
2086 unsigned char *laststart = 0;
2088 /* Address of beginning of regexp, or inside of last group. */
2089 unsigned char *begalt;
2091 /* Place in the uncompiled pattern (i.e., the {) to
2092 which to go back if the interval is invalid. */
2093 re_char *beg_interval;
2095 /* Address of the place where a forward jump should go to the end of
2096 the containing expression. Each alternative of an `or' -- except the
2097 last -- ends with a forward jump of this sort. */
2098 unsigned char *fixup_alt_jump = 0;
2100 /* Counts open-groups as they are encountered. Remembered for the
2101 matching close-group on the compile stack, so the same register
2102 number is put in the stop_memory as the start_memory. */
2103 regnum_t regnum = 0;
2104 auto inline void free_stack(void) __attribute__((always_inline));
2106 /* we need to close over compile_stack */
2107 auto inline void free_stack(void)
2109 xfree(compile_stack.stack);
2113 #ifdef REGEX_DEBUG_FLAG
2114 DEBUG_PRINT1("\nCompiling pattern: ");
2118 for (debug_count = 0; debug_count < size; debug_count++)
2119 putchar(pattern[debug_count]);
2124 /* Initialize the compile stack. */
2125 compile_stack.stack =
2126 TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2127 if (compile_stack.stack == NULL) {
2130 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2131 compile_stack.avail = 0;
2133 /* Initialize the pattern buffer. */
2134 bufp->syntax = syntax;
2135 bufp->fastmap_accurate = 0;
2136 bufp->not_bol = bufp->not_eol = 0;
2138 /* Set `used' to zero, so that if we return an error, the pattern
2139 printer (for debugging) will think there's no pattern. We reset it
2143 /* Always count groups, whether or not bufp->no_sub is set. */
2145 bufp->re_ngroups = 0;
2147 /* Allocate index translation array if needed. */
2148 if (bufp->external_to_internal_register == 0) {
2149 bufp->external_to_internal_register_size =
2150 INIT_REG_TRANSLATE_SIZE;
2151 RETALLOC(bufp->external_to_internal_register,
2152 bufp->external_to_internal_register_size, int);
2154 /* Initialize translations to impossible value to aid debugging. */
2158 bufp->external_to_internal_register[0] = 0;
2159 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2160 bufp->external_to_internal_register[i] =
2164 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2165 /* Initialize the syntax table. */
2169 if (bufp->allocated == 0) {
2171 /* If zero allocated, but buffer is non-null, try to
2172 realloc enough space. This loses if buffer's address
2173 is bogus, but that is the user's responsibility. */
2174 RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
2176 /* Caller did not allocate a buffer. Do it for them. */
2177 bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
2179 if (!bufp->buffer) {
2184 bufp->allocated = INIT_BUF_SIZE;
2187 begalt = buf_end = bufp->buffer;
2189 /* Loop through the uncompiled pattern until we're at the end. */
2195 if ( /* If at start of pattern, it's an operator. */
2197 /* If context independent, it's an operator. */
2198 || syntax & RE_CONTEXT_INDEP_ANCHORS
2199 /* Otherwise, depends on what's come before. */
2200 || at_begline_loc_p(pattern, p, syntax))
2208 if ( /* If at end of pattern, it's an
2211 /* If context independent, it's an
2213 || syntax & RE_CONTEXT_INDEP_ANCHORS
2214 /* Otherwise, depends on what's
2216 || at_endline_loc_p(p, pend, syntax))
2225 if ((syntax & RE_BK_PLUS_QM) ||
2226 (syntax & RE_LIMITED_OPS)) {
2232 re_bool zero_times_ok;
2233 re_bool many_times_ok;
2236 /* If there is no previous pattern... */
2238 if (syntax & RE_CONTEXT_INVALID_OPS) {
2241 } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) {
2246 /* true means zero/many matches are allowed. */
2247 zero_times_ok = c != '+';
2248 many_times_ok = c != '?';
2250 /* true means match shortest string possible. */
2253 /* If there is a sequence of repetition chars, collapse it
2254 down to just one (the right one). We can't combine
2255 interval operators with these because of, e.g., `a{2}*',
2256 which should only match an even number of `a's. */
2261 || (!(syntax & RE_BK_PLUS_QM)
2262 && (c == '+' || c == '?'))) ;
2264 else if (syntax & RE_BK_PLUS_QM
2272 if (!(c1 == '+' || c1 == '?')) {
2284 /* If we get here, we found another repeat
2286 if (!(syntax & RE_NO_MINIMAL_MATCHING)) {
2287 /* "*?" and "+?" and "??" are okay (and
2288 mean match minimally), but other
2289 sequences (such as "*??" and "+++")
2290 are rejected (reserved for future
2292 if (minimal || c != '?') {
2298 zero_times_ok |= c != '+';
2299 many_times_ok |= c != '?';
2303 /* Star, etc. applied to an empty pattern is equivalent
2304 to an empty pattern. */
2309 /* Now we know whether zero matches is allowed
2310 and whether two or more matches is allowed
2311 and whether we want minimal or maximal matching. */
2313 if (!many_times_ok) {
2315 0: /on_failure_jump to 6
2320 GET_BUFFER_SPACE(6);
2321 INSERT_JUMP(jump, laststart,
2324 INSERT_JUMP(on_failure_jump,
2328 } else if (zero_times_ok) {
2332 6: /on_failure_jump to 3
2335 GET_BUFFER_SPACE(6);
2336 INSERT_JUMP(jump, laststart,
2339 STORE_JUMP(on_failure_jump,
2346 3: /on_failure_jump to 0
2349 GET_BUFFER_SPACE(3);
2350 STORE_JUMP(on_failure_jump,
2351 buf_end, laststart);
2355 /* Are we optimizing this jump? */
2356 re_bool keep_string_p = false;
2358 if (many_times_ok) {
2359 /* More than one repetition is allowed,
2360 so put in at the end a backward
2361 relative jump from `buf_end' to
2362 before the next jump we're going to
2363 put in below (which jumps from
2364 laststart to after this jump).
2366 But if we are at the `*' in the exact
2367 sequence `.*\n', insert an
2368 unconditional jump backwards to the
2369 ., instead of the beginning of the
2370 loop. This way we only push a
2371 failure point once, instead of every
2372 time through the loop. */
2373 assert(p - 1 > pattern);
2375 /* Allocate the space for the jump. */
2376 GET_BUFFER_SPACE(3);
2378 /* We know we are not at the first
2379 character of the pattern, because
2380 laststart was nonzero. And we've
2381 already incremented `p', by the way,
2382 to be the character after the `*'.
2383 Do we have to do something analogous
2384 here for null bytes, because of
2386 if (*(p - 2) == '.' &&
2390 !(syntax & RE_DOT_NEWLINE)) {
2395 keep_string_p = true;
2397 /* Anything else. */
2403 /* We've added more stuff to the
2408 /* On failure, jump from laststart to buf_end +
2409 3, which will be the end of the buffer after
2410 this jump is inserted. */
2411 GET_BUFFER_SPACE(3);
2412 INSERT_JUMP(keep_string_p ?
2413 on_failure_keep_string_jump
2415 laststart, buf_end + 3);
2418 if (!zero_times_ok) {
2419 /* At least one repetition is required,
2420 so insert a `dummy_failure_jump'
2421 before the initial `on_failure_jump'
2422 instruction of the loop. This effects
2423 a skip over that instruction the
2424 first time we hit that loop. */
2425 GET_BUFFER_SPACE(3);
2426 INSERT_JUMP(dummy_failure_jump,
2437 laststart = buf_end;
2442 /* XEmacs change: this whole section */
2443 re_bool had_char_class = false;
2445 re_bool has_extended_chars = false;
2446 REGISTER Lisp_Object rtab = Qnil;
2454 /* Ensure that we have enough space to push a charset:
2455 the opcode, the length count, and the bitset; 34
2457 GET_BUFFER_SPACE(34);
2459 laststart = buf_end;
2461 /* We test `*p == '^' twice, instead of using an if
2462 statement, so we only need one BUF_PUSH. */
2463 BUF_PUSH(*p == '^' ? charset_not : charset);
2467 /* Remember the first position in the bracket
2471 /* Push the number of bytes in the bitmap. */
2472 BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2474 /* Clear the whole map. */
2475 memset(buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2477 /* charset_not matches newline according to a syntax
2479 if ((re_opcode_t) buf_end[-2] == charset_not
2480 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2484 start_over_with_extended:
2485 if (has_extended_chars) {
2486 /* There are extended chars here, which means we
2487 need to start over and shift to unified
2488 range-table format. */
2489 if (buf_end[-2] == charset)
2490 buf_end[-2] = charset_mule;
2492 buf_end[-2] = charset_mule_not;
2495 /* go back to the beginning of the charset,
2496 after a possible ^. */
2497 rtab = Vthe_lisp_rangetab;
2498 Fclear_range_table(rtab);
2500 /* charset_not matches newline according to a
2502 if ((re_opcode_t) buf_end[-1] ==
2505 RE_HAT_LISTS_NOT_NEWLINE))
2506 SET_EITHER_BIT('\n');
2510 /* Read in characters and ranges, setting map bits. */
2519 if (c >= 0x80 && !has_extended_chars) {
2520 has_extended_chars = 1;
2521 /* Frumble-bumble, we've found some
2522 extended chars. Need to start over,
2523 process everything using the general
2524 extended-char mechanism, and need to
2525 use charset_mule and charset_mule_not
2526 instead of charset and
2528 goto start_over_with_extended;
2531 /* \ might escape characters inside [...] and
2533 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) &&
2543 && !has_extended_chars) {
2544 has_extended_chars = 1;
2545 goto start_over_with_extended;
2552 /* Could be the end of the bracket
2553 expression. If it's not (i.e., when
2554 the bracket expression is `[]' so
2555 far), the ']' character bit gets set
2557 if (c == ']' && p != p1 + 1)
2560 /* Look ahead to see if it's a range
2561 when the last thing was a character
2563 if (had_char_class && c == '-'
2569 /* Look ahead to see if it's a range
2570 when the last thing was a character:
2571 if this is a hyphen not at the
2572 beginning or the end of a list, then
2573 it's the range operator. */
2575 && !(p - 2 >= pattern
2577 && !(p - 3 >= pattern
2584 if (*(const unsigned char *)p >= 0x80
2585 && !has_extended_chars) {
2586 has_extended_chars = 1;
2587 goto start_over_with_extended;
2589 if (has_extended_chars)
2590 ret = compile_extended_range
2591 (&p, pend, translate,
2596 (&p, pend, translate,
2598 if (ret != REG_NOERROR) {
2604 else if (p[0] == '-' && p[1] != ']') {
2605 /* This handles ranges made up of
2609 /* Move past the `-'. */
2613 if (*(const unsigned char*)p >= 0x80
2614 && !has_extended_chars) {
2615 has_extended_chars = 1;
2616 goto start_over_with_extended;
2618 if (has_extended_chars)
2619 ret = compile_extended_range(
2620 &p, pend, translate,
2624 ret = compile_range(
2625 &p, pend, translate,
2627 if (ret != REG_NOERROR) {
2633 /* See if we're at the beginning of a possible
2636 else if (syntax & RE_CHAR_CLASSES &&
2637 c == '[' && *p == ':') {
2638 /* Leave room for the null. */
2639 char str[CHAR_CLASS_MAX_LENGTH + 1];
2644 /* If pattern is `[[:'. */
2650 /* #### This code is unused.
2651 Correctness is not checked
2652 after TRT table change. */
2654 if (c == ':' || c == ']'
2657 CHAR_CLASS_MAX_LENGTH)
2659 str[c1++] = (char)c;
2663 /* If isn't a word bracketed by
2664 `[:' and `:]': undo the
2665 ending character, the
2666 letters, and leave the
2667 leading `:' and `[' (but set
2669 if (c == ':' && *p == ']') {
2675 if (wct == RECC_ERROR) {
2680 /* Throw away the ] at
2700 re_iswctype(ch, wct)) {
2704 if (re_iswctype(ch, wct)) {
2709 had_char_class = true;
2715 SET_EITHER_BIT('[');
2716 SET_EITHER_BIT(':');
2717 had_char_class = false;
2720 had_char_class = false;
2726 if (has_extended_chars) {
2727 /* We have a range table, not a bit vector. */
2729 unified_range_table_bytes_needed
2731 GET_BUFFER_SPACE(bytes_needed);
2732 unified_range_table_copy_data(rtab,
2735 unified_range_table_bytes_used
2740 /* Discard any (non)matching list bytes that are
2741 all 0 at the end of the map. Decrease the
2742 map-length byte too. */
2743 while ((int)buf_end[-1] > 0
2744 && buf_end[buf_end[-1] - 1] == 0)
2746 buf_end += buf_end[-1];
2751 if (syntax & RE_NO_BK_PARENS)
2757 if (syntax & RE_NO_BK_PARENS)
2763 if (syntax & RE_NEWLINE_ALT)
2769 if (syntax & RE_NO_BK_VBAR)
2775 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2776 goto handle_interval;
2786 /* Do not translate the character after the \, so that we can
2787 distinguish, e.g., \B from \b, even if we normally would
2788 translate, e.g., B to b. */