1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
10 This file is part of SXEmacs
12 SXEmacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 SXEmacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
86 /* The `emacs' switch turns on certain matching commands
87 that make sense only in Emacs. */
96 Lisp_Object Vthe_lisp_rangetab;
98 void complex_vars_of_regex(void)
100 Vthe_lisp_rangetab = Fmake_range_table();
101 staticpro(&Vthe_lisp_rangetab);
106 void complex_vars_of_regex(void)
112 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
113 #define TRANSLATE_P(tr) (!NILP (tr) && CHAR_TABLEP(tr))
115 /* just massage that xfree thing */
117 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
125 /* If we are not linking with Emacs proper,
126 we can't use the relocating allocator
127 even if config.h says that we can. */
130 #if defined (STDC_HEADERS) || defined (_LIBC)
132 # include <stdbool.h>
138 /* Types normally included via lisp.h */
139 #include <stddef.h> /* for ptrdiff_t */
142 #ifndef DECLARE_NOTHING
143 #define DECLARE_NOTHING struct nosuchstruct
149 #define charptr_emchar(str) ((Emchar) (str)[0])
151 #define INC_CHARPTR(p) ((p)++)
152 #define DEC_CHARPTR(p) ((p)--)
156 /* Define the syntax stuff for \<, \>, etc. */
158 /* This must be nonzero for the wordchar and notwordchar pattern
159 commands in re_match_2. */
166 extern char *re_syntax_table;
168 #else /* not SYNTAX_TABLE */
170 /* How many characters in the character set. */
171 #define CHAR_SET_SIZE 256
173 static char re_syntax_table[CHAR_SET_SIZE];
175 static void init_syntax_once(void)
180 const char *word_syntax_chars =
181 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
183 memset(re_syntax_table, 0, sizeof(re_syntax_table));
185 while (*word_syntax_chars)
186 re_syntax_table[(unsigned int)(*word_syntax_chars++)] =
193 #endif /* SYNTAX_TABLE */
195 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
196 #undef SYNTAX_FROM_CACHE
197 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
199 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
200 #define TRANSLATE_P(tr) tr
202 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
203 # if defined HAVE_GC_GC_H
205 # elif defined HAVE_GC_H
208 # define xmalloc GC_MALLOC
209 # define xmalloc_atomic GC_MALLOC_ATOMIC
210 # define xrealloc GC_REALLOC
211 # define xfree GC_FREE
212 # define xstrdup GC_STRDUP
213 #else /* !HAVE_BDWGC */
214 # define xmalloc malloc
215 # define xmalloc_atomic malloc
216 # define xrealloc realloc
218 # define xstrdup strdup
219 #endif /* HAVE_BDWGC */
224 /* Under XEmacs, this is needed because we don't define it elsewhere. */
225 #ifdef SWITCH_ENUM_BUG
226 #define SWITCH_ENUM_CAST(x) ((int)(x))
228 #define SWITCH_ENUM_CAST(x) (x)
231 /* Get the interface, including the syntax bits. */
234 /* isalpha etc. are used for the character classes. */
237 /* Jim Meyering writes:
239 "... Some ctype macros are valid only for character codes that
240 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
241 using /bin/cc or gcc but without giving an ansi option). So, all
242 ctype uses should be through macros like ISPRINT... If
243 STDC_HEADERS is defined, then autoconf has verified that the ctype
244 macros don't need to be guarded with references to isascii. ...
245 Defining isascii to 1 should let any compiler worth its salt
246 eliminate the && through constant folding." */
248 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
249 #define ISASCII_1(c) 1
251 #define ISASCII_1(c) isascii(c)
255 /* The IS*() macros can be passed any character, including an extended
256 one. We need to make sure there are no crashes, which would occur
257 otherwise due to out-of-bounds array references. */
258 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
260 #define ISASCII(c) ISASCII_1 (c)
264 #define ISBLANK(c) (ISASCII (c) && isblank (c))
266 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
269 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
271 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
274 /* stolen from FSF/src/charset.h */
275 #define SINGLE_BYTE_CHAR_P(c) (((unsigned int)(c) & 0xFF) == (c))
276 /* 1 if C is an ASCII character. */
277 #define IS_REAL_ASCII(c) ((c) < 128)
278 /* 1 if C is a unibyte character. */
279 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P(c))
281 #define ISPRINT(c) (ISASCII (c) && isprint (c))
282 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
283 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
284 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
285 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
286 #define ISLOWER(c) (ISASCII (c) && islower (c))
287 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
288 #define ISSPACE(c) (ISASCII (c) && isspace (c))
289 #define ISUPPER(c) (ISASCII (c) && isupper (c))
290 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
295 __attribute__((always_inline));
299 Lisp_Object ct = regex_emacs_buffer->mirror_syntax_table;
301 return (SYNTAX(XCHAR_TABLE(ct), c) == Sword);
304 # define ISWORD(c) (ISALPHA(c))
308 #define NULL (void *)0
311 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
312 since ours (we hope) works properly with all combinations of
313 machines, compilers, `char' and `unsigned char' argument types.
314 (Per Bothner suggested the basic approach.) */
315 #undef SIGN_EXTEND_CHAR
317 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
318 #else /* not __STDC__ */
319 /* As in Harbison and Steele. */
320 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
323 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
324 use `alloca' instead of `malloc'. This is because using malloc in
325 re_search* or re_match* could cause memory leaks when C-g is used in
326 Emacs; also, malloc is slower and causes storage fragmentation. On
327 the other hand, malloc is more portable, and easier to debug.
329 Because we sometimes use alloca, some routines have to be macros,
330 not functions -- `alloca'-allocated space disappears at the end of the
331 function it is called in. */
335 #define REGEX_ALLOCATE xmalloc
336 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc(source, nsize)
337 #define REGEX_FREE xfree
339 #else /* !REGEX_MALLOC */
341 /* Emacs already defines alloca, sometimes. */
344 /* Make alloca work the best possible way. */
346 #define alloca __builtin_alloca
347 #else /* not __GNUC__ */
350 #else /* not __GNUC__ or HAVE_ALLOCA_H */
351 #ifndef _AIX /* Already did AIX, up at the top. */
353 #endif /* not _AIX */
354 #endif /* HAVE_ALLOCA_H */
355 #endif /* __GNUC__ */
357 #endif /* not alloca */
359 #define REGEX_ALLOCATE alloca
361 /* Assumes a `char *destination' variable. */
362 #define REGEX_REALLOCATE(source, osize, nsize) \
363 (destination = (char *) alloca (nsize), \
364 memmove (destination, source, osize), \
367 /* No need to do anything to free, after alloca. */
368 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
370 #endif /* REGEX_MALLOC */
372 /* Define how to allocate the failure stack. */
376 #define REGEX_ALLOCATE_STACK(size) \
377 r_alloc ((char **) &failure_stack_ptr, (size))
378 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
379 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
380 #define REGEX_FREE_STACK(ptr) \
381 r_alloc_free ((void **) &failure_stack_ptr)
383 #else /* !REL_ALLOC */
387 #define REGEX_ALLOCATE_STACK xmalloc
388 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc(source, nsize)
389 #define REGEX_FREE_STACK xfree
391 #else /* !REGEX_MALLOC */
393 #define REGEX_ALLOCATE_STACK alloca
395 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
396 REGEX_REALLOCATE (source, osize, nsize)
397 /* No need to explicitly free anything. */
398 #define REGEX_FREE_STACK(arg)
400 #endif /* REGEX_MALLOC */
401 #endif /* REL_ALLOC */
403 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
404 `string1' or just past its end. This works if PTR is NULL, which is
406 #define FIRST_STRING_P(ptr) \
407 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
409 /* (Re)Allocate N items of type T using malloc, or fail. */
410 #define TALLOC(n, t) \
411 ((t *)xmalloc((n) * sizeof (t)))
412 #define TALLOC_ATOMIC(n, t) \
413 ((t *)xmalloc_atomic((n) * sizeof (t)))
414 #define RETALLOC(addr, n, t) \
415 ((addr) = (t *)xrealloc(addr, (n) * sizeof (t)))
416 #define RETALLOC_IF(addr, n, t) \
418 RETALLOC((addr), (n), t); \
420 (addr) = TALLOC ((n), t)
421 #define REGEX_TALLOC(n, t) \
422 ((t *)REGEX_ALLOCATE((n) * sizeof (t)))
424 #define BYTEWIDTH 8 /* In bits. */
426 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
430 #define MAX(a, b) ((a) > (b) ? (a) : (b))
431 #define MIN(a, b) ((a) < (b) ? (a) : (b))
433 /* Type of source-pattern and string chars. */
434 typedef const unsigned char re_char;
436 typedef char re_bool;
440 /* These are the command codes that appear in compiled regular
441 expressions. Some opcodes are followed by argument bytes. A
442 command code can specify any interpretation whatsoever for its
443 arguments. Zero bytes may appear in the compiled regular expression. */
448 /* Succeed right away--no more backtracking. */
451 /* Followed by one byte giving n, then by n literal bytes. */
454 /* Matches any (more or less) character. */
457 /* Matches any one char belonging to specified set. First
458 following byte is number of bitmap bytes. Then come bytes
459 for a bitmap saying which chars are in. Bits in each byte
460 are ordered low-bit-first. A character is in the set if its
461 bit is 1. A character too large to have a bit in the map is
462 automatically not in the set. */
465 /* Same parameters as charset, but match any character that is
466 not one of those specified. */
469 /* Start remembering the text that is matched, for storing in a
470 register. Followed by one byte with the register number, in
471 the range 1 to the pattern buffer's re_ngroups
472 field. Then followed by one byte with the number of groups
473 inner to this one. (This last has to be part of the
474 start_memory only because we need it in the on_failure_jump
478 /* Stop remembering the text that is matched and store it in a
479 memory register. Followed by one byte with the register
480 number, in the range 1 to `re_ngroups' in the
481 pattern buffer, and one byte with the number of inner groups,
482 just like `start_memory'. (We need the number of inner
483 groups here because we don't have any easy way of finding the
484 corresponding start_memory when we're at a stop_memory.) */
487 /* Match a duplicate of something remembered. Followed by one
488 byte containing the register number. */
491 /* Fail unless at beginning of line. */
494 /* Fail unless at end of line. */
497 /* Succeeds if at beginning of buffer (if emacs) or at beginning
498 of string to be matched (if not). */
501 /* Analogously, for end of buffer/string. */
504 /* Followed by two byte relative address to which to jump. */
507 /* Same as jump, but marks the end of an alternative. */
510 /* Followed by two-byte relative address of place to resume at
511 in case of failure. */
514 /* Like on_failure_jump, but pushes a placeholder instead of the
515 current string position when executed. */
516 on_failure_keep_string_jump,
518 /* Throw away latest failure point and then jump to following
519 two-byte relative address. */
522 /* Change to pop_failure_jump if know won't have to backtrack to
523 match; otherwise change to jump. This is used to jump
524 back to the beginning of a repeat. If what follows this jump
525 clearly won't match what the repeat does, such that we can be
526 sure that there is no use backtracking out of repetitions
527 already matched, then we change it to a pop_failure_jump.
528 Followed by two-byte address. */
531 /* Jump to following two-byte address, and push a dummy failure
532 point. This failure point will be thrown away if an attempt
533 is made to use it for a failure. A `+' construct makes this
534 before the first repeat. Also used as an intermediary kind
535 of jump when compiling an alternative. */
538 /* Push a dummy failure point and continue. Used at the end of
542 /* Followed by two-byte relative address and two-byte number n.
543 After matching N times, jump to the address upon failure. */
546 /* Followed by two-byte relative address, and two-byte number n.
547 Jump to the address N times, then fail. */
550 /* Set the following two-byte relative address to the
551 subsequent two-byte number. The address *includes* the two
555 wordchar, /* Matches any word-constituent character. */
556 notwordchar, /* Matches any char that is not a word-constituent. */
558 wordbeg, /* Succeeds if at word beginning. */
559 wordend, /* Succeeds if at word end. */
561 wordbound, /* Succeeds if at a word boundary. */
562 notwordbound /* Succeeds if not at a word boundary. */
564 , before_dot, /* Succeeds if before point. */
565 at_dot, /* Succeeds if at point. */
566 after_dot, /* Succeeds if after point. */
568 /* Matches any character whose syntax is specified. Followed by
569 a byte which contains a syntax code, e.g., Sword. */
572 /* Matches any character whose syntax is not that specified. */
576 /* need extra stuff to be able to properly work with XEmacs/Mule
577 characters (which may take up more than one byte) */
578 , charset_mule, /* Matches any character belonging to specified set.
579 The set is stored in "unified range-table
580 format"; see rangetab.c. Unlike the `charset'
581 opcode, this can handle arbitrary characters. */
583 charset_mule_not /* Same parameters as charset_mule, but match any
584 character that is not one of those specified. */
585 /* 97/2/17 jhod: The following two were merged back in from the Mule
586 2.3 code to enable some language specific processing */
587 , categoryspec, /* Matches entries in the character category tables */
588 notcategoryspec /* The opposite of the above */
592 /* Common operations on the compiled pattern. */
594 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
596 #define STORE_NUMBER(destination, number) \
598 (destination)[0] = (number) & 0377; \
599 (destination)[1] = (number) >> 8; \
602 /* Same as STORE_NUMBER, except increment DESTINATION to
603 the byte after where the number is stored. Therefore, DESTINATION
604 must be an lvalue. */
606 #define STORE_NUMBER_AND_INCR(destination, number) \
608 STORE_NUMBER (destination, number); \
609 (destination) += 2; \
612 /* Put into DESTINATION a number stored in two contiguous bytes starting
615 #define EXTRACT_NUMBER(destination, source) \
617 (destination) = *(source) & 0377; \
618 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
621 #ifdef REGEX_DEBUG_FLAG
622 static void extract_number(int *dest, re_char * source)
624 int temp = SIGN_EXTEND_CHAR(*(source + 1));
625 *dest = *source & 0377;
629 #ifndef EXTRACT_MACROS /* To debug the macros. */
630 #undef EXTRACT_NUMBER
631 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
632 #endif /* not EXTRACT_MACROS */
634 #endif /* REGEX_DEBUG_FLAG */
636 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
637 SOURCE must be an lvalue. */
639 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
641 EXTRACT_NUMBER (destination, source); \
645 #ifdef REGEX_DEBUG_FLAG
647 extract_number_and_incr(int *destination, unsigned char **source)
649 extract_number(destination, *source);
653 #ifndef EXTRACT_MACROS
654 #undef EXTRACT_NUMBER_AND_INCR
655 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
656 extract_number_and_incr (&dest, &src)
657 #endif /* not EXTRACT_MACROS */
659 #endif /* REGEX_DEBUG_FLAG */
661 /* If REGEX_DEBUG_FLAG is defined, Regex prints many voluminous messages about
662 what it is doing (if the variable `debug' is nonzero). If linked with the
663 main program in `iregex.c', you can enter patterns and strings interactively.
664 And if linked with the main program in `main.c' and the other test files, you
665 can run the already-written tests. */
667 #if defined (REGEX_DEBUG_FLAG)
669 /* We use standard I/O for debugging. */
673 /* XEmacs provides its own version of assert() */
674 /* It is useful to test things that ``must'' be true when debugging. */
679 /* no point in having this one, since we do not offer any accessor to this */
680 static int debug = 0;
683 #define DEBUG_STATEMENT(e) e
684 #define DEBUG_PRINT1(x) printf(x)
685 #define DEBUG_PRINT2(x1, x2) printf(x1, x2)
686 #define DEBUG_PRINT3(x1, x2, x3) printf(x1, x2, x3)
687 #define DEBUG_PRINT4(x1, x2, x3, x4) printf(x1, x2, x3, x4)
688 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
689 print_partial_compiled_pattern(s, e)
690 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
691 print_double_string (w, s1, sz1, s2, sz2)
693 /* Print the fastmap in human-readable form. */
695 static void print_fastmap(char *fastmap)
697 unsigned was_a_range = 0;
700 while (i < (1 << BYTEWIDTH)) {
704 while (i < (1 << BYTEWIDTH) && fastmap[i]) {
717 /* Print a compiled pattern string in human-readable form, starting at
718 the START pointer into it and ending just before the pointer END. */
720 /* can't help it ... the ugly mule code changes `start' upon alignment,
721 * so this can't have the const qualifier ... bugger */
723 print_partial_compiled_pattern(unsigned char *start, re_char *end)
726 unsigned char *p = start;
734 /* Loop over pattern commands. */
736 printf("%ld:\t", (long)(p - start));
738 switch ((const re_opcode_t)*p++) {
745 printf("/exactn/%d", mcnt);
755 printf("/start_memory/%d/%d", mcnt, *p++);
760 printf("/stop_memory/%d/%d", mcnt, *p++);
764 printf("/duplicate/%d", *p++);
773 REGISTER int c, last = -100;
774 REGISTER int in_range = 0;
776 printf("/charset [%s",
777 (re_opcode_t) * (p - 1) == charset_not
780 assert(p + *p < pend);
782 for (c = 0; c < 256; c++)
783 if (((unsigned char)(c / 8) < *p)
786 /* Are we starting a range? */
787 if (last + 1 == c && !in_range) {
791 /* Have we broken a range? */
792 else if (last + 1 != c
815 case charset_mule_not: {
818 printf("/charset_mule [%s",
819 (re_opcode_t) * (p - 1) == charset_mule_not
821 /* u_r_t_nentries() CHANGES its arg, why on earth
822 * is this marked const here then? :O -hrop */
823 nentries = unified_range_table_nentries(p);
824 for (i = 0; i < nentries; i++) {
825 EMACS_INT first, last;
826 Lisp_Object dummy_val;
828 /* u_r_t_get_range CHANGES its arg ...
829 * p can't be const thence */
830 unified_range_table_get_range(p, i,
837 printf("(0x%lx)", (long)first);
848 p += unified_range_table_bytes_used(p);
861 case on_failure_jump:
862 extract_number_and_incr(&mcnt, &p);
863 printf("/on_failure_jump to %ld",
864 (long)(p + mcnt - start));
867 case on_failure_keep_string_jump:
868 extract_number_and_incr(&mcnt, &p);
869 printf("/on_failure_keep_string_jump to %ld",
870 (long)(p + mcnt - start));
873 case dummy_failure_jump:
874 extract_number_and_incr(&mcnt, &p);
875 printf("/dummy_failure_jump to %ld",
876 (long)(p + mcnt - start));
879 case push_dummy_failure:
880 printf("/push_dummy_failure");
884 extract_number_and_incr(&mcnt, &p);
885 printf("/maybe_pop_jump to %ld",
886 (long)(p + mcnt - start));
889 case pop_failure_jump:
890 extract_number_and_incr(&mcnt, &p);
891 printf("/pop_failure_jump to %ld",
892 (long)(p + mcnt - start));
896 extract_number_and_incr(&mcnt, &p);
897 printf("/jump_past_alt to %ld",
898 (long)(p + mcnt - start));
902 extract_number_and_incr(&mcnt, &p);
903 printf("/jump to %ld", (long)(p + mcnt - start));
907 extract_number_and_incr(&mcnt, &p);
908 extract_number_and_incr(&mcnt2, &p);
909 printf("/succeed_n to %ld, %d times",
910 (long)(p + mcnt - start), mcnt2);
914 extract_number_and_incr(&mcnt, &p);
915 extract_number_and_incr(&mcnt2, &p);
916 printf("/jump_n to %ld, %d times",
917 (long)(p + mcnt - start), mcnt2);
921 extract_number_and_incr(&mcnt, &p);
922 extract_number_and_incr(&mcnt2, &p);
923 printf("/set_number_at location %ld to %d",
924 (long)(p + mcnt - start), mcnt2);
928 printf("/wordbound");
932 printf("/notwordbound");
944 printf("/before_dot");
952 printf("/after_dot");
956 printf("/syntaxspec");
962 printf("/notsyntaxspec");
968 /* 97/2/17 jhod Mule category patch */
970 printf("/categoryspec");
975 case notcategoryspec:
976 printf("/notcategoryspec");
980 /* end of category patch */
989 printf("/notwordchar");
1002 printf("?%d", *(p - 1));
1008 printf("%ld:\tend of pattern.\n", (long)(p - start));
1012 print_compiled_pattern(struct re_pattern_buffer *bufp)
1016 re_char *buffer = bufp->buffer;
1018 unsigned char *buffer = bufp->buffer;
1021 print_partial_compiled_pattern(buffer, buffer + bufp->used);
1022 printf("%ld bytes used/%ld bytes allocated.\n", bufp->used,
1025 if (bufp->fastmap_accurate && bufp->fastmap) {
1026 printf("fastmap: ");
1027 print_fastmap(bufp->fastmap);
1030 printf("re_nsub: %ld\t", (long)bufp->re_nsub);
1031 printf("re_ngroups: %ld\t", (long)bufp->re_ngroups);
1032 printf("regs_alloc: %d\t", bufp->regs_allocated);
1033 printf("can_be_null: %d\t", bufp->can_be_null);
1034 printf("newline_anchor: %d\n", bufp->newline_anchor);
1035 printf("no_sub: %d\t", bufp->no_sub);
1036 printf("not_bol: %d\t", bufp->not_bol);
1037 printf("not_eol: %d\t", bufp->not_eol);
1038 printf("syntax: %d\n", bufp->syntax);
1039 /* Perhaps we should print the translate table? */
1040 /* and maybe the category table? */
1042 if (bufp->external_to_internal_register) {
1045 printf("external_to_internal_register:\n");
1046 for (i = 0; i <= bufp->re_nsub; i++) {
1049 printf("%d -> %d", i,
1050 bufp->external_to_internal_register[i]);
1057 print_double_string(re_char * where, re_char * string1, int size1,
1058 re_char * string2, int size2)
1063 Element_count this_char;
1065 if (FIRST_STRING_P(where)) {
1066 for (this_char = where - string1; this_char < size1;
1068 putchar(string1[this_char]);
1073 for (this_char = where - string2; this_char < size2;
1075 putchar(string2[this_char]);
1079 #else /* not REGEX_DEBUG_FLAG */
1084 #define DEBUG_STATEMENT(e)
1085 #define DEBUG_PRINT1(x)
1086 #define DEBUG_PRINT2(x1, x2)
1087 #define DEBUG_PRINT3(x1, x2, x3)
1088 #define DEBUG_PRINT4(x1, x2, x3, x4)
1089 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1090 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1092 #endif /* REGEX_DEBUG_FLAG */
1094 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1095 also be assigned to arbitrarily: each pattern buffer stores its own
1096 syntax, so it can be changed between regex compilations. */
1097 /* This has no initializer because initialized variables in Emacs
1098 become read-only after dumping. */
1099 reg_syntax_t re_syntax_options;
1101 /* Specify the precise syntax of regexps for compilation. This provides
1102 for compatibility for various utilities which historically have
1103 different, incompatible syntaxes.
1105 The argument SYNTAX is a bit mask comprised of the various bits
1106 defined in regex.h. We return the old syntax. */
1108 reg_syntax_t re_set_syntax(reg_syntax_t syntax)
1110 reg_syntax_t ret = re_syntax_options;
1112 re_syntax_options = syntax;
1116 /* This table gives an error message for each of the error codes listed
1117 in regex.h. Obviously the order here has to be same as there.
1118 POSIX doesn't require that we do anything for REG_NOERROR,
1119 but why not be nice? */
1121 static const char *re_error_msgid[] = {
1122 "Success", /* REG_NOERROR */
1123 "No match", /* REG_NOMATCH */
1124 "Invalid regular expression", /* REG_BADPAT */
1125 "Invalid collation character", /* REG_ECOLLATE */
1126 "Invalid character class name", /* REG_ECTYPE */
1127 "Trailing backslash", /* REG_EESCAPE */
1128 "Invalid back reference", /* REG_ESUBREG */
1129 "Unmatched [ or [^", /* REG_EBRACK */
1130 "Unmatched ( or \\(", /* REG_EPAREN */
1131 "Unmatched \\{", /* REG_EBRACE */
1132 "Invalid content of \\{\\}", /* REG_BADBR */
1133 "Invalid range end", /* REG_ERANGE */
1134 "Memory exhausted", /* REG_ESPACE */
1135 "Invalid preceding regular expression", /* REG_BADRPT */
1136 "Premature end of regular expression", /* REG_EEND */
1137 "Regular expression too big", /* REG_ESIZE */
1138 "Unmatched ) or \\)", /* REG_ERPAREN */
1140 "Invalid syntax designator", /* REG_ESYNTAX */
1143 "Ranges may not span charsets", /* REG_ERANGESPAN */
1144 "Invalid category designator", /* REG_ECATEGORY */
1148 /* Avoiding alloca during matching, to placate r_alloc. */
1150 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1151 searching and matching functions should not call alloca. On some
1152 systems, alloca is implemented in terms of malloc, and if we're
1153 using the relocating allocator routines, then malloc could cause a
1154 relocation, which might (if the strings being searched are in the
1155 ralloc heap) shift the data out from underneath the regexp
1158 Here's another reason to avoid allocation: Emacs
1159 processes input from X in a signal handler; processing X input may
1160 call malloc; if input arrives while a matching routine is calling
1161 malloc, then we're scrod. But Emacs can't just block input while
1162 calling matching routines; then we don't notice interrupts when
1163 they come in. So, Emacs blocks input around all regexp calls
1164 except the matching calls, which it leaves unprotected, in the
1165 faith that they will not malloc. */
1167 /* Normally, this is fine. */
1168 #define MATCH_MAY_ALLOCATE
1170 /* When using GNU C, we are not REALLY using the C alloca, no matter
1171 what config.h may say. So don't take precautions for it. */
1176 /* The match routines may not allocate if (1) they would do it with malloc
1177 and (2) it's not safe for them to use malloc.
1178 Note that if REL_ALLOC is defined, matching would not use malloc for the
1179 failure stack, but we would still use it for the register vectors;
1180 so REL_ALLOC should not affect this. */
1181 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1182 #undef MATCH_MAY_ALLOCATE
1185 /* Failure stack declarations and macros; both re_compile_fastmap and
1186 re_match_2 use a failure stack. These have to be macros because of
1187 REGEX_ALLOCATE_STACK. */
1189 /* Number of failure points for which to initially allocate space
1190 when matching. If this number is exceeded, we allocate more
1191 space, so it is not a hard limit. */
1192 #ifndef INIT_FAILURE_ALLOC
1193 #define INIT_FAILURE_ALLOC 20
1196 /* Roughly the maximum number of failure points on the stack. Would be
1197 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1198 This is a variable only so users of regex can assign to it; we never
1199 change it ourselves. */
1200 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1201 /* 4400 was enough to cause a crash on Alpha OSF/1,
1202 whose default stack limit is 2mb. */
1203 int re_max_failures = 50000;
1205 int re_max_failures = 2000;
1208 union fail_stack_elt {
1209 const unsigned char *pointer;
1213 typedef union fail_stack_elt fail_stack_elt_t;
1216 fail_stack_elt_t *stack;
1218 Element_count avail; /* Offset of next open position. */
1221 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1222 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1223 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1225 /* Define macros to initialize and free the failure stack.
1226 Do `return -2' if the alloc fails. */
1228 #ifdef MATCH_MAY_ALLOCATE
1229 #define INIT_FAIL_STACK() \
1231 fail_stack.stack = (fail_stack_elt_t *) \
1232 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \
1233 sizeof (fail_stack_elt_t)); \
1235 if (fail_stack.stack == NULL) \
1238 fail_stack.size = INIT_FAILURE_ALLOC; \
1239 fail_stack.avail = 0; \
1242 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1244 #define INIT_FAIL_STACK() \
1246 fail_stack.avail = 0; \
1249 #define RESET_FAIL_STACK()
1252 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1254 Return 1 if succeeds, and 0 if either ran out of memory
1255 allocating space for it or it was already too large.
1257 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1259 #define DOUBLE_FAIL_STACK(fail_stack) \
1260 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1262 : ((fail_stack).stack = (fail_stack_elt_t *) \
1263 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1264 (fail_stack).size * sizeof (fail_stack_elt_t), \
1265 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1267 (fail_stack).stack == NULL \
1269 : ((fail_stack).size <<= 1, \
1272 /* Push pointer POINTER on FAIL_STACK.
1273 Return 1 if was able to do so and 0 if ran out of memory allocating
1275 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1276 ((FAIL_STACK_FULL () \
1277 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1279 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1282 /* Push a pointer value onto the failure stack.
1283 Assumes the variable `fail_stack'. Probably should only
1284 be called from within `PUSH_FAILURE_POINT'. */
1285 #define PUSH_FAILURE_POINTER(item) \
1286 fail_stack.stack[fail_stack.avail++].pointer = \
1287 (const unsigned char*)(item)
1289 /* This pushes an integer-valued item onto the failure stack.
1290 Assumes the variable `fail_stack'. Probably should only
1291 be called from within `PUSH_FAILURE_POINT'. */
1292 #define PUSH_FAILURE_INT(item) \
1293 fail_stack.stack[fail_stack.avail++].integer = (item)
1295 /* Push a fail_stack_elt_t value onto the failure stack.
1296 Assumes the variable `fail_stack'. Probably should only
1297 be called from within `PUSH_FAILURE_POINT'. */
1298 #define PUSH_FAILURE_ELT(item) \
1299 fail_stack.stack[fail_stack.avail++] = (item)
1301 /* These three POP... operations complement the three PUSH... operations.
1302 All assume that `fail_stack' is nonempty. */
1303 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1304 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1305 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1307 /* Used to omit pushing failure point id's when we're not debugging. */
1308 #ifdef REGEX_DEBUG_FLAG
1309 #define DEBUG_PUSH PUSH_FAILURE_INT
1310 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1312 #define DEBUG_PUSH(item)
1313 #define DEBUG_POP(item_addr)
1316 /* Push the information about the state we will need
1317 if we ever fail back to it.
1319 Requires variables fail_stack, regstart, regend, reg_info, and
1320 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1323 Does `return FAILURE_CODE' if runs out of memory. */
1325 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1326 #define DECLARE_DESTINATION char *destination
1328 #define DECLARE_DESTINATION DECLARE_NOTHING
1331 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1333 DECLARE_DESTINATION; \
1334 /* Must be int, so when we don't save any registers, the\
1335 arithmetic \ of 0 + -1 isn't done as unsigned. */ \
1338 DEBUG_STATEMENT (failure_id++); \
1339 DEBUG_STATEMENT (nfailure_points_pushed++); \
1340 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1341 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1342 (long unsigned int)(fail_stack).avail); \
1343 DEBUG_PRINT2 (" size: %lu\n", \
1344 (long unsigned int)(fail_stack).size); \
1346 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1347 DEBUG_PRINT2 (" available: %ld\n", \
1348 (long) REMAINING_AVAIL_SLOTS); \
1350 /* Ensure we have enough space allocated for what \
1351 * we will push. */ \
1352 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) { \
1353 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1354 return failure_code; \
1356 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1357 (unsigned long) (fail_stack).size); \
1358 DEBUG_PRINT2 (" slots available: %ld\n", \
1359 (long) REMAINING_AVAIL_SLOTS); \
1362 /* Push the info, starting with the registers. */ \
1363 DEBUG_PRINT1 ("\n"); \
1365 for (this_reg = lowest_active_reg; \
1366 this_reg <= highest_active_reg; \
1368 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1369 DEBUG_STATEMENT (num_regs_pushed++); \
1371 DEBUG_PRINT2(" start: %p\n", \
1372 regstart[this_reg]); \
1373 PUSH_FAILURE_POINTER(regstart[this_reg]); \
1375 DEBUG_PRINT2 (" end: %p\n", \
1376 regend[this_reg]); \
1377 PUSH_FAILURE_POINTER(regend[this_reg]); \
1379 DEBUG_PRINT2 (" info: 0x%p\n ", \
1380 ®_info[this_reg]); \
1381 DEBUG_PRINT2 (" match_null=%d", \
1382 REG_MATCH_NULL_STRING_P \
1383 (reg_info[this_reg])); \
1384 DEBUG_PRINT2 (" active=%d", \
1385 IS_ACTIVE (reg_info[this_reg])); \
1386 DEBUG_PRINT2 (" matched_something=%d", \
1388 (reg_info[this_reg])); \
1389 DEBUG_PRINT2 (" ever_matched_something=%d", \
1390 EVER_MATCHED_SOMETHING \
1391 (reg_info[this_reg])); \
1392 DEBUG_PRINT1 ("\n"); \
1393 PUSH_FAILURE_ELT(reg_info[this_reg].word); \
1396 DEBUG_PRINT2 (" Pushing low active reg: %d\n", \
1397 lowest_active_reg); \
1398 PUSH_FAILURE_INT (lowest_active_reg); \
1400 DEBUG_PRINT2 (" Pushing high active reg: %d\n", \
1401 highest_active_reg); \
1402 PUSH_FAILURE_INT (highest_active_reg); \
1404 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", \
1405 (long) pattern_place); \
1406 DEBUG_PRINT_COMPILED_PATTERN(bufp, pattern_place, pend); \
1407 PUSH_FAILURE_POINTER (pattern_place); \
1409 DEBUG_PRINT2(" Pushing string %p: `", \
1411 DEBUG_PRINT_DOUBLE_STRING(string_place, \
1414 DEBUG_PRINT1("'\n"); \
1415 PUSH_FAILURE_POINTER (string_place); \
1417 DEBUG_PRINT2(" Pushing failure id: %u\n", failure_id); \
1418 DEBUG_PUSH(failure_id); \
1421 /* This is the number of items that are pushed and popped on the stack
1422 for each register. */
1423 #define NUM_REG_ITEMS 3
1425 /* Individual items aside from the registers. */
1427 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1429 #define NUM_NONREG_ITEMS 4
1432 /* We push at most this many items on the stack. */
1433 /* We used to use (num_regs - 1), which is the number of registers
1434 this regexp will save; but that was changed to 5
1435 to avoid stack overflow for a regexp with lots of parens. */
1436 #define MAX_FAILURE_ITEMS (5 * (NUM_REG_ITEMS + NUM_NONREG_ITEMS))
1438 /* We actually push this many items. */
1439 #define NUM_FAILURE_ITEMS \
1440 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1443 /* How many items can still be added to the stack without overflowing it. */
1444 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1446 /* Pops what PUSH_FAIL_STACK pushes.
1448 We restore into the parameters, all of which should be lvalues:
1449 STR -- the saved data position.
1450 PAT -- the saved pattern position.
1451 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1452 REGSTART, REGEND -- arrays of string positions.
1453 REG_INFO -- array of information about each subexpression.
1455 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1456 `pend', `string1', `size1', `string2', and `size2'. */
1458 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1459 regstart, regend, reg_info) \
1461 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1463 const unsigned char *string_temp; \
1465 assert (!FAIL_STACK_EMPTY ()); \
1467 /* Remove failure points and point to \
1468 * how many regs pushed. */ \
1469 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1470 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1471 (unsigned long) fail_stack.avail); \
1472 DEBUG_PRINT2 (" size: %lu\n", \
1473 (unsigned long) fail_stack.size); \
1475 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1477 DEBUG_POP (&ffailure_id.integer); \
1478 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1479 * (unsigned int *) &ffailure_id); \
1481 /* If the saved string location is NULL, it \
1482 came from an on_failure_keep_string_jump opcode, \
1483 and we want to throw away the saved NULL, thus \
1484 retaining our current position in the string. */ \
1485 string_temp = POP_FAILURE_POINTER (); \
1486 if (string_temp != NULL) \
1487 str = string_temp; \
1489 DEBUG_PRINT2(" Popping string %p: `", str); \
1490 DEBUG_PRINT_DOUBLE_STRING(str, \
1493 DEBUG_PRINT1("'\n"); \
1495 pat = (unsigned char *) POP_FAILURE_POINTER(); \
1496 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1497 DEBUG_PRINT_COMPILED_PATTERN(bufp, pat, pend); \
1499 /* Restore register info. */ \
1500 high_reg = (unsigned) POP_FAILURE_INT (); \
1501 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1503 low_reg = (unsigned) POP_FAILURE_INT (); \
1504 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1506 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) { \
1507 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1509 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1510 DEBUG_PRINT2 (" info: %p\n", \
1511 ®_info[this_reg]); \
1513 regend[this_reg] = POP_FAILURE_POINTER (); \
1514 DEBUG_PRINT2 (" end: %p\n", \
1515 regend[this_reg]); \
1517 regstart[this_reg] = POP_FAILURE_POINTER (); \
1518 DEBUG_PRINT2 (" start: %p\n", \
1519 regstart[this_reg]); \
1522 set_regs_matched_done = 0; \
1523 DEBUG_STATEMENT (nfailure_points_popped++); \
1524 } while (0) /* POP_FAILURE_POINT */
1526 /* Structure for per-register (a.k.a. per-group) information.
1527 Other register information, such as the
1528 starting and ending positions (which are addresses), and the list of
1529 inner groups (which is a bits list) are maintained in separate
1532 We are making a (strictly speaking) nonportable assumption here: that
1533 the compiler will pack our bit fields into something that fits into
1534 the type of `word', i.e., is something that fits into one item on the
1538 fail_stack_elt_t word;
1540 /* This field is one if this group can match the empty string,
1541 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1542 #define MATCH_NULL_UNSET_VALUE 3
1543 unsigned match_null_string_p:2;
1544 unsigned is_active:1;
1545 unsigned matched_something:1;
1546 unsigned ever_matched_something:1;
1548 } register_info_type;
1550 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1551 #define IS_ACTIVE(R) ((R).bits.is_active)
1552 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1553 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1555 /* Call this when have matched a real character; it sets `matched' flags
1556 for the subexpressions which we are currently inside. Also records
1557 that those subexprs have matched. */
1558 #define SET_REGS_MATCHED() \
1561 if (!set_regs_matched_done) \
1564 set_regs_matched_done = 1; \
1565 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1567 MATCHED_SOMETHING (reg_info[r]) \
1568 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1575 /* Registers are set to a sentinel when they haven't yet matched. */
1576 static unsigned char reg_unset_dummy;
1577 #define REG_UNSET_VALUE (®_unset_dummy)
1578 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1580 /* Subroutine declarations and macros for regex_compile. */
1582 /* Fetch the next character in the uncompiled pattern---translating it
1583 if necessary. Also cast from a signed character in the constant
1584 string passed to us by the user to an unsigned char that we can use
1585 as an array index (in, e.g., `translate'). */
1586 #define PATFETCH(c) \
1589 c = TRANSLATE (c); \
1592 /* Fetch the next character in the uncompiled pattern, with no
1594 #define PATFETCH_RAW(c) \
1595 do {if (p == pend) return REG_EEND; \
1596 assert (p < pend); \
1597 c = charptr_emchar (p); \
1601 /* Go backwards one character in the pattern. */
1602 #define PATUNFETCH DEC_CHARPTR (p)
1606 #define PATFETCH_EXTENDED(emch) \
1607 do {if (p == pend) return REG_EEND; \
1608 assert (p < pend); \
1609 emch = charptr_emchar ((const Bufbyte *) p); \
1611 if (TRANSLATE_P (translate) && emch < 0x80) \
1612 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1615 #define PATFETCH_RAW_EXTENDED(emch) \
1616 do {if (p == pend) return REG_EEND; \
1617 assert (p < pend); \
1618 emch = charptr_emchar ((const Bufbyte *) p); \
1622 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1624 #define PATFETCH_EITHER(emch) \
1626 if (has_extended_chars) \
1627 PATFETCH_EXTENDED (emch); \
1632 #define PATFETCH_RAW_EITHER(emch) \
1634 if (has_extended_chars) \
1635 PATFETCH_RAW_EXTENDED (emch); \
1637 PATFETCH_RAW (emch); \
1640 #define PATUNFETCH_EITHER \
1642 if (has_extended_chars) \
1643 PATUNFETCH_EXTENDED (emch); \
1645 PATUNFETCH (emch); \
1648 #else /* not MULE */
1650 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1651 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1652 #define PATUNFETCH_EITHER PATUNFETCH
1656 /* If `translate' is non-null, return translate[D], else just D. We
1657 cast the subscript to translate because some data is declared as
1658 `char *', to avoid warnings when a string constant is passed. But
1659 when we use a character as a subscript we must make it unsigned. */
1660 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1662 /* Macros for outputting the compiled pattern into `buffer'. */
1664 /* If the buffer isn't allocated when it comes in, use this. */
1665 #define INIT_BUF_SIZE 32
1667 /* Make sure we have at least N more bytes of space in buffer. */
1668 #define GET_BUFFER_SPACE(n) \
1669 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1672 /* Make sure we have one more byte of buffer space and then add C to it. */
1673 #define BUF_PUSH(c) \
1675 GET_BUFFER_SPACE (1); \
1676 *buf_end++ = (unsigned char) (c); \
1679 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1680 #define BUF_PUSH_2(c1, c2) \
1682 GET_BUFFER_SPACE (2); \
1683 *buf_end++ = (unsigned char) (c1); \
1684 *buf_end++ = (unsigned char) (c2); \
1687 /* As with BUF_PUSH_2, except for three bytes. */
1688 #define BUF_PUSH_3(c1, c2, c3) \
1690 GET_BUFFER_SPACE (3); \
1691 *buf_end++ = (unsigned char) (c1); \
1692 *buf_end++ = (unsigned char) (c2); \
1693 *buf_end++ = (unsigned char) (c3); \
1696 /* Store a jump with opcode OP at LOC to location TO. We store a
1697 relative address offset by the three bytes the jump itself occupies. */
1698 #define STORE_JUMP(op, loc, to) \
1699 store_op1 (op, loc, (to) - (loc) - 3)
1701 /* Likewise, for a two-argument jump. */
1702 #define STORE_JUMP2(op, loc, to, arg) \
1703 store_op2 (op, loc, (to) - (loc) - 3, arg)
1705 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1707 #define INSERT_JUMP(op, loc, to) \
1708 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1710 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1712 #define INSERT_JUMP2(op, loc, to, arg) \
1713 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1715 /* This is not an arbitrary limit: the arguments which represent offsets
1716 into the pattern are two bytes long. So if 2^16 bytes turns out to
1717 be too small, many things would have to change. */
1718 #define MAX_BUF_SIZE (1L << 16)
1720 /* Extend the buffer by twice its current size via realloc and
1721 reset the pointers that pointed into the old block to point to the
1722 correct places in the new one. If extending the buffer results in it
1723 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1724 #define EXTEND_BUFFER() \
1726 re_char *old_buffer = bufp->buffer; \
1727 if (bufp->allocated == MAX_BUF_SIZE) \
1729 bufp->allocated <<= 1; \
1730 if (bufp->allocated > MAX_BUF_SIZE) \
1731 bufp->allocated = MAX_BUF_SIZE; \
1732 bufp->buffer = (unsigned char *)xrealloc( \
1733 bufp->buffer, bufp->allocated); \
1734 if (bufp->buffer == NULL) { \
1735 return REG_ESPACE; \
1737 /* If the buffer moved, move all the pointers into it. */ \
1738 if (old_buffer != bufp->buffer) { \
1739 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1740 begalt = (begalt - old_buffer) + bufp->buffer; \
1741 if (fixup_alt_jump) { \
1743 (fixup_alt_jump - old_buffer) + \
1747 laststart = (laststart - old_buffer) + \
1750 if (pending_exact) { \
1752 (pending_exact - old_buffer) + \
1758 /* Since we have one byte reserved for the register number argument to
1759 {start,stop}_memory, the maximum number of groups we can report
1760 things about is what fits in that byte. */
1761 #define MAX_REGNUM 255
1763 /* But patterns can have more than `MAX_REGNUM' registers. We just
1764 ignore the excess. */
1765 typedef unsigned regnum_t;
1767 #define INIT_REG_TRANSLATE_SIZE 5
1769 /* Macros for the compile stack. */
1771 /* Since offsets can go either forwards or backwards, this type needs to
1772 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1773 typedef int pattern_offset_t;
1776 pattern_offset_t begalt_offset;
1777 pattern_offset_t fixup_alt_jump;
1778 pattern_offset_t inner_group_offset;
1779 pattern_offset_t laststart_offset;
1781 } compile_stack_elt_t;
1784 compile_stack_elt_t *stack;
1786 unsigned avail; /* Offset of next open position. */
1787 } compile_stack_type;
1789 #define INIT_COMPILE_STACK_SIZE 32
1791 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1792 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1794 /* The next available element. */
1795 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1797 /* Set the bit for character C in a bit vector. */
1798 #define SET_LIST_BIT(c) \
1799 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1800 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1804 /* Set the "bit" for character C in a range table. */
1805 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1807 /* Set the "bit" for character c in the appropriate table. */
1808 #define SET_EITHER_BIT(c) \
1810 if (has_extended_chars) \
1811 SET_RANGETAB_BIT (c); \
1816 #else /* not MULE */
1818 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1822 /* Get the next unsigned number in the uncompiled pattern. */
1823 #define GET_UNSIGNED_NUMBER(num) \
1827 while (ISDIGIT (c)) \
1831 num = num * 10 + c - '0'; \
1839 #define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
1842 static void store_op1(re_opcode_t op, unsigned char *loc, int arg);
1843 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1844 static void insert_op1(re_opcode_t op, unsigned char *loc, int arg,
1845 unsigned char *end);
1846 static void insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1847 unsigned char *end);
1848 static re_bool at_begline_loc_p(re_char*, re_char*, reg_syntax_t);
1849 static re_bool at_endline_loc_p(re_char*, re_char*, int syntax);
1850 static re_bool group_in_compile_stack(compile_stack_type compile_stack,
1852 static reg_errcode_t compile_range(re_char ** p_ptr, re_char * pend,
1853 RE_TRANSLATE_TYPE translate,
1854 reg_syntax_t syntax, unsigned char *b);
1856 static reg_errcode_t compile_extended_range(re_char ** p_ptr,
1858 RE_TRANSLATE_TYPE translate,
1859 reg_syntax_t syntax,
1862 static re_bool group_match_null_string_p(unsigned char **p,
1864 register_info_type * reg_info);
1865 static re_bool alt_match_null_string_p(unsigned char *p, unsigned char *end,
1866 register_info_type * reg_info);
1867 static re_bool common_op_match_null_string_p(unsigned char **p,
1869 register_info_type * reg_info);
1870 static int bcmp_translate(const unsigned char *s1, const unsigned char *s2,
1871 REGISTER int len, RE_TRANSLATE_TYPE translate);
1872 static int re_match_2_internal(struct re_pattern_buffer *bufp,
1873 re_char * string1, int size1,
1874 re_char * string2, int size2, int pos,
1875 struct re_registers *regs, int stop);
1877 #ifndef MATCH_MAY_ALLOCATE
1879 /* If we cannot allocate large objects within re_match_2_internal,
1880 we make the fail stack and register vectors global.
1881 The fail stack, we grow to the maximum size when a regexp
1883 The register vectors, we adjust in size each time we
1884 compile a regexp, according to the number of registers it needs. */
1886 static fail_stack_type fail_stack;
1888 /* Size with which the following vectors are currently allocated.
1889 That is so we can make them bigger as needed,
1890 but never make them smaller. */
1891 static int regs_allocated_size;
1893 static re_char **regstart, **regend;
1894 static re_char **old_regstart, **old_regend;
1895 static re_char **best_regstart, **best_regend;
1896 static register_info_type *reg_info;
1897 static re_char **reg_dummy;
1898 static register_info_type *reg_info_dummy;
1900 /* Make the register vectors big enough for NUM_REGS registers,
1901 but don't make them smaller. */
1903 static void regex_grow_registers(int num_regs)
1905 if (num_regs > regs_allocated_size) {
1906 RETALLOC_IF(regstart, num_regs, re_char *);
1907 RETALLOC_IF(regend, num_regs, re_char *);
1908 RETALLOC_IF(old_regstart, num_regs, re_char *);
1909 RETALLOC_IF(old_regend, num_regs, re_char *);
1910 RETALLOC_IF(best_regstart, num_regs, re_char *);
1911 RETALLOC_IF(best_regend, num_regs, re_char *);
1912 RETALLOC_IF(reg_info, num_regs, register_info_type);
1913 RETALLOC_IF(reg_dummy, num_regs, re_char *);
1914 RETALLOC_IF(reg_info_dummy, num_regs, register_info_type);
1916 regs_allocated_size = num_regs;
1920 #endif /* not MATCH_MAY_ALLOCATE */
1922 /* auxiliary stuff, FSF theft */
1923 /* Character classes. */
1926 RECC_ALNUM, RECC_ALPHA, RECC_WORD,
1927 RECC_GRAPH, RECC_PRINT,
1928 RECC_LOWER, RECC_UPPER,
1929 RECC_PUNCT, RECC_CNTRL,
1930 RECC_DIGIT, RECC_XDIGIT,
1931 RECC_BLANK, RECC_SPACE,
1932 RECC_MULTIBYTE, RECC_NONASCII,
1933 RECC_ASCII, RECC_UNIBYTE
1936 /* Map a string to the char class it names (if any). */
1937 static inline re_wctype_t
1938 re_wctype(re_char *str)
1940 const char *string = (const char*)str;
1942 if (STREQ (string, "alnum")) {
1944 } else if (STREQ (string, "alpha")) {
1946 } else if (STREQ (string, "digit")) {
1948 } else if (STREQ (string, "graph")) {
1950 } else if (STREQ (string, "space")) {
1952 } else if (STREQ (string, "upper")) {
1954 } else if (STREQ (string, "lower")) {
1956 } else if (STREQ (string, "word")) {
1958 } else if (STREQ (string, "print")) {
1960 } else if (STREQ (string, "punct")) {
1962 } else if (STREQ (string, "xdigit")) {
1964 } else if (STREQ (string, "cntrl")) {
1966 } else if (STREQ (string, "blank")) {
1968 } else if (STREQ (string, "ascii")) {
1970 } else if (STREQ (string, "nonascii")) {
1971 return RECC_NONASCII;
1972 } else if (STREQ (string, "unibyte")) {
1973 return RECC_UNIBYTE;
1974 } else if (STREQ (string, "multibyte")) {
1975 return RECC_MULTIBYTE;
1981 /* True if CH is in the char class CC. */
1983 re_iswctype(int ch, re_wctype_t cc)
1987 return ISALNUM (ch);
1989 return ISALPHA (ch);
1991 return ISBLANK (ch);
1993 return ISCNTRL (ch);
1995 return ISDIGIT (ch);
1997 return ISGRAPH (ch);
1999 return ISLOWER (ch);
2001 return ISPRINT (ch);
2003 return ISPUNCT (ch);
2005 return ISSPACE (ch);
2007 return ISUPPER (ch);
2009 return ISXDIGIT (ch);
2011 return IS_REAL_ASCII (ch);
2013 return !IS_REAL_ASCII (ch);
2017 return ISUNIBYTE((unsigned int)ch);
2018 case RECC_MULTIBYTE:
2019 return !ISUNIBYTE((unsigned int)ch);
2029 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2030 Returns one of error codes defined in `regex.h', or zero for success.
2032 Assumes the `allocated' (and perhaps `buffer') and `translate'
2033 fields are set in BUFP on entry.
2035 If it succeeds, results are put in BUFP (if it returns an error, the
2036 contents of BUFP are undefined):
2037 `buffer' is the compiled pattern;
2038 `syntax' is set to SYNTAX;
2039 `used' is set to the length of the compiled pattern;
2040 `fastmap_accurate' is zero;
2041 `re_ngroups' is the number of groups/subexpressions (including shy
2043 `re_nsub' is the number of non-shy groups in PATTERN;
2044 `not_bol' and `not_eol' are zero;
2046 The `fastmap' and `newline_anchor' fields are neither
2047 examined nor set. */
2049 static reg_errcode_t
2050 regex_compile(re_char * pattern, int size, reg_syntax_t syntax,
2051 struct re_pattern_buffer *bufp)
2053 /* We fetch characters from PATTERN here. We declare these as int
2054 (or possibly long) so that chars above 127 can be used as
2055 array indices. The macros that fetch a character from the pattern
2056 make sure to coerce to unsigned char before assigning, so we won't
2057 get bitten by negative numbers here. */
2058 /* XEmacs change: used to be unsigned char. */
2059 REGISTER EMACS_INT c, c1;
2061 /* A random temporary spot in PATTERN. */
2064 /* Points to the end of the buffer, where we should append. */
2065 REGISTER unsigned char *buf_end;
2067 /* Keeps track of unclosed groups. */
2068 compile_stack_type compile_stack;
2070 /* Points to the current (ending) position in the pattern. */
2071 re_char *p = pattern;
2072 re_char *pend = pattern + size;
2074 /* How to translate the characters in the pattern. */
2075 RE_TRANSLATE_TYPE translate = bufp->translate;
2077 /* Address of the count-byte of the most recently inserted `exactn'
2078 command. This makes it possible to tell if a new exact-match
2079 character can be added to that command or if the character requires
2080 a new `exactn' command. */
2081 unsigned char *pending_exact = 0;
2083 /* Address of start of the most recently finished expression.
2084 This tells, e.g., postfix * where to find the start of its
2085 operand. Reset at the beginning of groups and alternatives. */
2086 unsigned char *laststart = 0;
2088 /* Address of beginning of regexp, or inside of last group. */
2089 unsigned char *begalt;
2091 /* Place in the uncompiled pattern (i.e., the {) to
2092 which to go back if the interval is invalid. */
2093 re_char *beg_interval;
2095 /* Address of the place where a forward jump should go to the end of
2096 the containing expression. Each alternative of an `or' -- except the
2097 last -- ends with a forward jump of this sort. */
2098 unsigned char *fixup_alt_jump = 0;
2100 /* Counts open-groups as they are encountered. Remembered for the
2101 matching close-group on the compile stack, so the same register
2102 number is put in the stop_memory as the start_memory. */
2103 regnum_t regnum = 0;
2104 auto inline void free_stack(void) __attribute__((always_inline));
2106 /* we need to close over compile_stack */
2107 auto inline void free_stack(void)
2109 xfree(compile_stack.stack);
2113 #ifdef REGEX_DEBUG_FLAG
2114 DEBUG_PRINT1("\nCompiling pattern: ");
2118 for (debug_count = 0; debug_count < size; debug_count++)
2119 putchar(pattern[debug_count]);
2124 /* Initialize the compile stack. */
2125 compile_stack.stack =
2126 TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2127 if (compile_stack.stack == NULL) {
2130 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2131 compile_stack.avail = 0;
2133 /* Initialize the pattern buffer. */
2134 bufp->syntax = syntax;
2135 bufp->fastmap_accurate = 0;
2136 bufp->not_bol = bufp->not_eol = 0;
2138 /* Set `used' to zero, so that if we return an error, the pattern
2139 printer (for debugging) will think there's no pattern. We reset it
2143 /* Always count groups, whether or not bufp->no_sub is set. */
2145 bufp->re_ngroups = 0;
2147 /* Allocate index translation array if needed. */
2148 if (bufp->external_to_internal_register == 0) {
2149 bufp->external_to_internal_register_size =
2150 INIT_REG_TRANSLATE_SIZE;
2151 RETALLOC(bufp->external_to_internal_register,
2152 bufp->external_to_internal_register_size, int);
2154 /* Initialize translations to impossible value to aid debugging. */
2158 bufp->external_to_internal_register[0] = 0;
2159 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2160 bufp->external_to_internal_register[i] =
2164 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2165 /* Initialize the syntax table. */
2169 if (bufp->allocated == 0) {
2171 /* If zero allocated, but buffer is non-null, try to
2172 realloc enough space. This loses if buffer's address
2173 is bogus, but that is the user's responsibility. */
2174 RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
2176 /* Caller did not allocate a buffer. Do it for them. */
2177 bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
2179 if (!bufp->buffer) {
2184 bufp->allocated = INIT_BUF_SIZE;
2187 begalt = buf_end = bufp->buffer;
2189 /* Loop through the uncompiled pattern until we're at the end. */
2195 if ( /* If at start of pattern, it's an operator. */
2197 /* If context independent, it's an operator. */
2198 || syntax & RE_CONTEXT_INDEP_ANCHORS
2199 /* Otherwise, depends on what's come before. */
2200 || at_begline_loc_p(pattern, p, syntax))
2208 if ( /* If at end of pattern, it's an
2211 /* If context independent, it's an
2213 || syntax & RE_CONTEXT_INDEP_ANCHORS
2214 /* Otherwise, depends on what's
2216 || at_endline_loc_p(p, pend, syntax))
2225 if ((syntax & RE_BK_PLUS_QM) ||
2226 (syntax & RE_LIMITED_OPS)) {
2232 re_bool zero_times_ok;
2233 re_bool many_times_ok;
2236 /* If there is no previous pattern... */
2238 if (syntax & RE_CONTEXT_INVALID_OPS) {
2241 } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) {
2246 /* true means zero/many matches are allowed. */
2247 zero_times_ok = c != '+';
2248 many_times_ok = c != '?';
2250 /* true means match shortest string possible. */
2253 /* If there is a sequence of repetition chars, collapse it
2254 down to just one (the right one). We can't combine
2255 interval operators with these because of, e.g., `a{2}*',
2256 which should only match an even number of `a's. */
2261 || (!(syntax & RE_BK_PLUS_QM)
2262 && (c == '+' || c == '?'))) ;
2264 else if (syntax & RE_BK_PLUS_QM
2272 if (!(c1 == '+' || c1 == '?')) {
2284 /* If we get here, we found another repeat
2286 if (!(syntax & RE_NO_MINIMAL_MATCHING)) {
2287 /* "*?" and "+?" and "??" are okay (and
2288 mean match minimally), but other
2289 sequences (such as "*??" and "+++")
2290 are rejected (reserved for future
2292 if (minimal || c != '?') {
2298 zero_times_ok |= c != '+';
2299 many_times_ok |= c != '?';
2303 /* Star, etc. applied to an empty pattern is equivalent
2304 to an empty pattern. */
2309 /* Now we know whether zero matches is allowed
2310 and whether two or more matches is allowed
2311 and whether we want minimal or maximal matching. */
2313 if (!many_times_ok) {
2315 0: /on_failure_jump to 6
2320 GET_BUFFER_SPACE(6);
2321 INSERT_JUMP(jump, laststart,
2324 INSERT_JUMP(on_failure_jump,
2328 } else if (zero_times_ok) {
2332 6: /on_failure_jump to 3
2335 GET_BUFFER_SPACE(6);
2336 INSERT_JUMP(jump, laststart,
2339 STORE_JUMP(on_failure_jump,
2346 3: /on_failure_jump to 0
2349 GET_BUFFER_SPACE(3);
2350 STORE_JUMP(on_failure_jump,
2351 buf_end, laststart);
2355 /* Are we optimizing this jump? */
2356 re_bool keep_string_p = false;
2358 if (many_times_ok) {
2359 /* More than one repetition is allowed,
2360 so put in at the end a backward
2361 relative jump from `buf_end' to
2362 before the next jump we're going to
2363 put in below (which jumps from
2364 laststart to after this jump).
2366 But if we are at the `*' in the exact
2367 sequence `.*\n', insert an
2368 unconditional jump backwards to the
2369 ., instead of the beginning of the
2370 loop. This way we only push a
2371 failure point once, instead of every
2372 time through the loop. */
2373 assert(p - 1 > pattern);
2375 /* Allocate the space for the jump. */
2376 GET_BUFFER_SPACE(3);
2378 /* We know we are not at the first
2379 character of the pattern, because
2380 laststart was nonzero. And we've
2381 already incremented `p', by the way,
2382 to be the character after the `*'.
2383 Do we have to do something analogous
2384 here for null bytes, because of
2386 if (*(p - 2) == '.' &&
2390 !(syntax & RE_DOT_NEWLINE)) {
2395 keep_string_p = true;
2397 /* Anything else. */
2403 /* We've added more stuff to the
2408 /* On failure, jump from laststart to buf_end +
2409 3, which will be the end of the buffer after
2410 this jump is inserted. */
2411 GET_BUFFER_SPACE(3);
2412 INSERT_JUMP(keep_string_p ?
2413 on_failure_keep_string_jump
2415 laststart, buf_end + 3);
2418 if (!zero_times_ok) {
2419 /* At least one repetition is required,
2420 so insert a `dummy_failure_jump'
2421 before the initial `on_failure_jump'
2422 instruction of the loop. This effects
2423 a skip over that instruction the
2424 first time we hit that loop. */
2425 GET_BUFFER_SPACE(3);
2426 INSERT_JUMP(dummy_failure_jump,
2437 laststart = buf_end;
2442 /* XEmacs change: this whole section */
2443 re_bool had_char_class = false;
2445 re_bool has_extended_chars = false;
2446 REGISTER Lisp_Object rtab = Qnil;
2454 /* Ensure that we have enough space to push a charset:
2455 the opcode, the length count, and the bitset; 34
2457 GET_BUFFER_SPACE(34);
2459 laststart = buf_end;
2461 /* We test `*p == '^' twice, instead of using an if
2462 statement, so we only need one BUF_PUSH. */
2463 BUF_PUSH(*p == '^' ? charset_not : charset);
2467 /* Remember the first position in the bracket
2471 /* Push the number of bytes in the bitmap. */
2472 BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2474 /* Clear the whole map. */
2475 memset(buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2477 /* charset_not matches newline according to a syntax
2479 if ((re_opcode_t) buf_end[-2] == charset_not
2480 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2484 start_over_with_extended:
2485 if (has_extended_chars) {
2486 /* There are extended chars here, which means we
2487 need to start over and shift to unified
2488 range-table format. */
2489 if (buf_end[-2] == charset)
2490 buf_end[-2] = charset_mule;
2492 buf_end[-2] = charset_mule_not;
2495 /* go back to the beginning of the charset,
2496 after a possible ^. */
2497 rtab = Vthe_lisp_rangetab;
2498 Fclear_range_table(rtab);
2500 /* charset_not matches newline according to a
2502 if ((re_opcode_t) buf_end[-1] ==
2505 RE_HAT_LISTS_NOT_NEWLINE))
2506 SET_EITHER_BIT('\n');
2510 /* Read in characters and ranges, setting map bits. */
2519 if (c >= 0x80 && !has_extended_chars) {
2520 has_extended_chars = 1;
2521 /* Frumble-bumble, we've found some
2522 extended chars. Need to start over,
2523 process everything using the general
2524 extended-char mechanism, and need to
2525 use charset_mule and charset_mule_not
2526 instead of charset and
2528 goto start_over_with_extended;
2531 /* \ might escape characters inside [...] and
2533 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) &&
2543 && !has_extended_chars) {
2544 has_extended_chars = 1;
2545 goto start_over_with_extended;
2552 /* Could be the end of the bracket
2553 expression. If it's not (i.e., when
2554 the bracket expression is `[]' so
2555 far), the ']' character bit gets set
2557 if (c == ']' && p != p1 + 1)
2560 /* Look ahead to see if it's a range
2561 when the last thing was a character
2563 if (had_char_class && c == '-'
2569 /* Look ahead to see if it's a range
2570 when the last thing was a character:
2571 if this is a hyphen not at the
2572 beginning or the end of a list, then
2573 it's the range operator. */
2575 && !(p - 2 >= pattern
2577 && !(p - 3 >= pattern
2584 if (*(const unsigned char *)p >= 0x80
2585 && !has_extended_chars) {
2586 has_extended_chars = 1;
2587 goto start_over_with_extended;
2589 if (has_extended_chars)
2590 ret = compile_extended_range
2591 (&p, pend, translate,
2596 (&p, pend, translate,
2598 if (ret != REG_NOERROR) {
2604 else if (p[0] == '-' && p[1] != ']') {
2605 /* This handles ranges made up of
2609 /* Move past the `-'. */
2613 if (*(const unsigned char*)p >= 0x80
2614 && !has_extended_chars) {
2615 has_extended_chars = 1;
2616 goto start_over_with_extended;
2618 if (has_extended_chars)
2619 ret = compile_extended_range(
2620 &p, pend, translate,
2624 ret = compile_range(
2625 &p, pend, translate,
2627 if (ret != REG_NOERROR) {
2633 /* See if we're at the beginning of a possible
2636 else if (syntax & RE_CHAR_CLASSES &&
2637 c == '[' && *p == ':') {
2638 /* Leave room for the null. */
2639 char str[CHAR_CLASS_MAX_LENGTH + 1];
2644 /* If pattern is `[[:'. */
2650 /* #### This code is unused.
2651 Correctness is not checked
2652 after TRT table change. */
2654 if (c == ':' || c == ']'
2657 CHAR_CLASS_MAX_LENGTH)
2659 str[c1++] = (char)c;
2663 /* If isn't a word bracketed by
2664 `[:' and `:]': undo the
2665 ending character, the
2666 letters, and leave the
2667 leading `:' and `[' (but set
2669 if (c == ':' && *p == ']') {
2675 if (wct == RECC_ERROR) {
2680 /* Throw away the ] at
2700 re_iswctype(ch, wct)) {
2704 if (re_iswctype(ch, wct)) {
2709 had_char_class = true;
2715 SET_EITHER_BIT('[');
2716 SET_EITHER_BIT(':');
2717 had_char_class = false;
2720 had_char_class = false;
2726 if (has_extended_chars) {
2727 /* We have a range table, not a bit vector. */
2729 unified_range_table_bytes_needed
2731 GET_BUFFER_SPACE(bytes_needed);
2732 unified_range_table_copy_data(rtab,
2735 unified_range_table_bytes_used
2740 /* Discard any (non)matching list bytes that are
2741 all 0 at the end of the map. Decrease the
2742 map-length byte too. */
2743 while ((int)buf_end[-1] > 0
2744 && buf_end[buf_end[-1] - 1] == 0)
2746 buf_end += buf_end[-1];
2751 if (syntax & RE_NO_BK_PARENS)
2757 if (syntax & RE_NO_BK_PARENS)
2763 if (syntax & RE_NEWLINE_ALT)
2769 if (syntax & RE_NO_BK_VBAR)
2775 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2776 goto handle_interval;
2786 /* Do not translate the character after the \, so that we can
2787 distinguish, e.g., \B from \b, even if we normally would
2788 translate, e.g., B to b. */
2793 if (syntax & RE_NO_BK_PARENS)
2794 goto normal_backslash;
2801 if (!(syntax & RE_NO_SHY_GROUPS)
2802 && p != pend && *p == '?') {
2823 /* Record the translation from capturing group index to
2824 register number, reallocating table as needed. */
2827 external_to_internal_register_size
2832 external_to_internal_register_size;
2834 external_to_internal_register_size
2837 external_to_internal_register,
2839 external_to_internal_register_size,
2845 external_to_internal_register_size;
2848 external_to_internal_register
2855 external_to_internal_register
2860 if (COMPILE_STACK_FULL) {
2861 RETALLOC(compile_stack.stack,
2864 compile_stack_elt_t);
2865 if (compile_stack.stack == NULL)
2868 compile_stack.size <<= 1;
2871 /* These are the values to restore when
2872 we hit end of this group. They are
2873 all relative offsets, so that if the
2874 whole pattern moves because of
2875 realloc, they will still be
2877 COMPILE_STACK_TOP.begalt_offset =
2878 begalt - bufp->buffer;
2879 COMPILE_STACK_TOP.fixup_alt_jump =
2884 COMPILE_STACK_TOP.laststart_offset =
2885 buf_end - bufp->buffer;
2886 COMPILE_STACK_TOP.regnum = r;
2888 /* We will eventually replace the 0 with
2889 the number of groups inner to this
2890 one. But do not push a start_memory
2891 for groups beyond the last one we can
2892 represent in the compiled pattern.
2893 #### bad bad bad. this will fail in
2894 lots of ways, if we ever have to
2895 backtrack for these groups.
2897 if (r <= MAX_REGNUM) {
2899 inner_group_offset =
2900 buf_end - bufp->buffer +
2902 BUF_PUSH_3(start_memory, r, 0);
2905 compile_stack.avail++;
2910 /* If we've reached MAX_REGNUM groups,
2911 then this open won't actually
2912 generate any code, so we'll have to
2913 clear pending_exact explicitly. */
2919 if (syntax & RE_NO_BK_PARENS)
2920 goto normal_backslash;
2922 if (COMPILE_STACK_EMPTY) {
2924 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2925 goto normal_backslash;
2933 if (fixup_alt_jump) {
2934 /* Push a dummy failure point at the end
2935 of the alternative for a possible
2936 future `pop_failure_jump' to pop.
2937 See comments at `push_dummy_failure'
2939 BUF_PUSH(push_dummy_failure);
2941 /* We allocated space for this jump when
2942 we assigned to `fixup_alt_jump', in
2943 the `handle_alt' case below. */
2944 STORE_JUMP(jump_past_alt,
2945 fixup_alt_jump, buf_end - 1);
2948 /* See similar code for backslashed left paren
2950 if (COMPILE_STACK_EMPTY) {
2952 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2960 /* Since we just checked for an empty stack
2961 above, this ``can't happen''. */
2962 assert(compile_stack.avail != 0);
2964 /* We don't just want to restore into
2965 `regnum', because later groups should
2966 continue to be numbered higher, as in
2967 `(ab)c(de)' -- the second group is
2969 regnum_t this_group_regnum;
2971 compile_stack.avail--;
2974 COMPILE_STACK_TOP.begalt_offset;
2987 COMPILE_STACK_TOP.regnum;
2988 /* If we've reached MAX_REGNUM groups,
2989 then this open won't actually
2990 generate any code, so we'll have to
2991 clear pending_exact explicitly. */
2994 /* We're at the end of the group, so now
2995 we know how many groups were inside
2997 if (this_group_regnum <= MAX_REGNUM) {
2998 unsigned char *inner_group_loc
3007 BUF_PUSH_3(stop_memory,
3015 case '|': /* `\|'. */
3016 if (syntax & RE_LIMITED_OPS
3017 || syntax & RE_NO_BK_VBAR) {
3018 goto normal_backslash;
3021 if (syntax & RE_LIMITED_OPS) {
3025 /* Insert before the previous alternative a jump
3026 which jumps to this alternative if the former
3028 GET_BUFFER_SPACE(3);
3029 INSERT_JUMP(on_failure_jump, begalt,
3034 /* The alternative before this one has a jump after it
3035 which gets executed if it gets matched. Adjust that
3036 jump so it will jump to this alternative's analogous
3037 jump (put in below, which in turn will jump to the next
3038 (if any) alternative's such jump, etc.). The last such
3039 jump jumps to the correct final destination. A picture:
3045 If we are at `b', then fixup_alt_jump right now points to a
3046 three-byte space after `a'. We'll put in the jump, set
3047 fixup_alt_jump to right after `b', and leave behind three
3048 bytes which we'll fill in when we get to after `c'. */
3051 STORE_JUMP(jump_past_alt,
3052 fixup_alt_jump, buf_end);
3054 /* Mark and leave space for a jump after this alternative,
3055 to be filled in later either by next alternative or
3056 when know we're at the end of a series of alternatives. */
3057 fixup_alt_jump = buf_end;
3058 GET_BUFFER_SPACE(3);
3066 /* If \{ is a literal. */
3067 if (!(syntax & RE_INTERVALS)
3068 /* If we're at `\{' and it's not the open-interval
3070 || ((syntax & RE_INTERVALS)
3071 && (syntax & RE_NO_BK_BRACES))
3072 || (p - 2 == pattern && p == pend))
3073 goto normal_backslash;
3077 /* If got here, then the syntax allows
3080 /* At least (most) this many matches
3082 int lower_bound = -1, upper_bound = -1;
3084 beg_interval = p - 1;
3087 if (syntax & RE_NO_BK_BRACES) {
3088 goto unfetch_interval;
3095 GET_UNSIGNED_NUMBER(lower_bound);
3098 GET_UNSIGNED_NUMBER(
3100 if (upper_bound < 0) {
3105 /* Interval such as `{1}' =>
3106 match exactly once. */
3107 upper_bound = lower_bound;
3111 || upper_bound > RE_DUP_MAX
3112 || lower_bound > upper_bound) {
3113 if (syntax & RE_NO_BK_BRACES) {
3114 goto unfetch_interval;
3121 if (!(syntax & RE_NO_BK_BRACES)) {
3130 if (syntax & RE_NO_BK_BRACES) {
3131 goto unfetch_interval;
3138 /* We just parsed a valid interval. */
3140 /* If it's invalid to have no preceding
3144 RE_CONTEXT_INVALID_OPS) {
3149 RE_CONTEXT_INDEP_OPS) {
3150 laststart = buf_end;
3152 goto unfetch_interval;
3156 /* If the upper bound is zero, don't
3157 want to succeed at all; jump from
3158 `laststart' to `b + 3', which will be
3159 the end of the buffer after we insert
3161 if (upper_bound == 0) {
3162 GET_BUFFER_SPACE(3);
3163 INSERT_JUMP(jump, laststart,
3168 /* Otherwise, we have a nontrivial interval. When
3169 we're all done, the pattern will look like:
3170 set_number_at <jump count> <upper bound>
3171 set_number_at <succeed_n count> <lower bound>
3172 succeed_n <after jump addr> <succeed_n count>
3174 jump_n <succeed_n addr> <jump count>
3175 (The upper bound and `jump_n' are omitted if
3176 `upper_bound' is 1, though.) */
3177 else { /* If the upper bound is > 1, we need to insert
3178 more at the end of the loop. */
3179 Memory_count nbytes =
3180 10 + (upper_bound > 1) * 10;
3182 GET_BUFFER_SPACE(nbytes);
3184 /* Initialize lower bound of the `succeed_n', even
3185 though it will be set during matching by its
3186 attendant `set_number_at' (inserted next),
3187 because `re_compile_fastmap' needs to know.
3188 Jump to the `jump_n' we might insert below. */
3189 INSERT_JUMP2(succeed_n,
3197 /* Code to initialize the lower bound. Insert
3198 before the `succeed_n'. The `5' is the last two
3199 bytes of this `set_number_at', plus 3 bytes of
3200 the following `succeed_n'. */
3201 insert_op2(set_number_at,
3207 if (upper_bound > 1) { /* More than one repetition is allowed, so
3208 append a backward jump to the `succeed_n'
3209 that starts this interval.
3211 When we've reached this during matching,
3212 we'll have matched the interval once, so
3213 jump back only `upper_bound - 1' times. */
3222 /* The location we want to set is the second
3223 parameter of the `jump_n'; that is `b-2' as
3224 an absolute address. `laststart' will be
3225 the `set_number_at' we're about to insert;
3226 `laststart+3' the number to set, the source
3227 for the relative address. But we are
3228 inserting into the middle of the pattern --
3229 so everything is getting moved up by 5.
3230 Conclusion: (b - 2) - (laststart + 3) + 5,
3231 i.e., b - laststart.
3233 We insert this at the beginning of the loop
3234 so that if we fail during matching, we'll
3235 reinitialize the bounds. */
3247 beg_interval = NULL;
3252 /* If an invalid interval, match the characters as literals. */
3253 assert(beg_interval);
3255 beg_interval = NULL;
3257 /* normal_char and normal_backslash need `c'. */
3260 if (!(syntax & RE_NO_BK_BRACES)) {
3261 if (p > pattern && p[-1] == '\\')
3262 goto normal_backslash;
3267 /* There is no way to specify the before_dot and after_dot
3268 operators. rms says this is ok. --karl */
3274 laststart = buf_end;
3276 /* XEmacs addition */
3277 if (c >= 0x80 || syntax_spec_code[c] == 0377) {
3281 BUF_PUSH_2(syntaxspec, syntax_spec_code[c]);
3285 laststart = buf_end;
3287 /* XEmacs addition */
3288 if (c >= 0x80 || syntax_spec_code[c] == 0377) {
3292 BUF_PUSH_2(notsyntaxspec, syntax_spec_code[c]);
3296 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
3298 laststart = buf_end;
3300 if (c < 32 || c > 127) {
3302 return REG_ECATEGORY;
3304 BUF_PUSH_2(categoryspec, c);
3308 laststart = buf_end;
3310 if (c < 32 || c > 127) {
3312 return REG_ECATEGORY;
3314 BUF_PUSH_2(notcategoryspec, c);
3316 /* end of category patch */
3321 laststart = buf_end;
3326 laststart = buf_end;
3327 BUF_PUSH(notwordchar);
3339 BUF_PUSH(wordbound);
3343 BUF_PUSH(notwordbound);
3365 if (syntax & RE_NO_BK_REFS) {
3368 /* External register indexing. */
3371 if (reg > bufp->re_nsub) {
3376 /* Convert external to internal as soon
3378 reg = bufp->external_to_internal_register[reg];
3380 /* Can't back reference to a
3381 subexpression if inside it. */
3382 if (group_in_compile_stack(
3383 compile_stack, reg)) {
3386 laststart = buf_end;
3387 BUF_PUSH_2(duplicate, reg);
3393 if (syntax & RE_BK_PLUS_QM) {
3396 goto normal_backslash;
3400 /* You might think it would be useful for \ to
3401 mean not to translate; but if we don't
3402 translate it, it will never match
3409 /* Expects the character in `c'. */
3410 /* `p' points to the location after where `c' came
3414 /* XEmacs: modifications here for Mule. */
3415 /* `q' points to the beginning of the next char. */
3418 /* If no exactn currently being built. */
3420 /* If last exactn not at current position. */
3421 || pending_exact + *pending_exact + 1 !=
3423 /* We have only one byte following the exactn for
3425 || ((unsigned int)(*pending_exact + (q - p)) >=
3426 ((unsigned int)(1 << BYTEWIDTH) - 1))
3428 /* If followed by a repetition operator. */
3429 || *q == '*' || *q == '^'
3430 || ((syntax & RE_BK_PLUS_QM)
3431 ? *q == '\\' && (q[1] == '+'
3433 : (*q == '+' || *q == '?'))
3434 || ((syntax & RE_INTERVALS)
3435 && ((syntax & RE_NO_BK_BRACES)
3437 : (q[0] == '\\' && q[1] == '{')))) {
3438 /* Start building a new exactn. */
3440 laststart = buf_end;
3442 BUF_PUSH_2(exactn, 0);
3443 pending_exact = buf_end - 1;
3451 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3455 set_charptr_emchar(tmp_buf, c);
3457 for (i = 0; i < bt_count; i++) {
3458 BUF_PUSH(tmp_buf[i]);
3466 } /* while p != pend */
3468 /* Through the pattern now. */
3470 if (fixup_alt_jump) {
3471 STORE_JUMP(jump_past_alt, fixup_alt_jump, buf_end);
3473 if (!COMPILE_STACK_EMPTY) {
3477 /* If we don't want backtracking, force success
3478 the first time we reach the end of the compiled pattern. */
3479 if (syntax & RE_NO_POSIX_BACKTRACKING) {
3482 xfree(compile_stack.stack);
3484 /* We have succeeded; set the length of the buffer. */
3485 bufp->used = buf_end - bufp->buffer;
3487 #ifdef REGEX_DEBUG_FLAG
3489 DEBUG_PRINT1("\nCompiled pattern: \n");
3490 print_compiled_pattern(bufp);
3494 #ifndef MATCH_MAY_ALLOCATE
3495 /* Initialize the failure stack to the largest possible stack. This
3496 isn't necessary unless we're trying to avoid calling alloca in
3497 the search and match routines. */
3499 int num_regs = bufp->re_ngroups + 1;
3501 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3502 is strictly greater than re_max_failures, the largest possible stack
3503 is 2 * re_max_failures failure points. */
3504 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) {
3506 (2 * re_max_failures * MAX_FAILURE_ITEMS);
3508 if (!fail_stack.stack) {
3510 (fail_stack_elt_t *)
3513 sizeof(fail_stack_elt_t));
3516 (fail_stack_elt_t *)
3517 xrealloc(fail_stack.stack,
3519 sizeof(fail_stack_elt_t)));
3523 regex_grow_registers(num_regs);
3525 #endif /* not MATCH_MAY_ALLOCATE */
3531 /* Subroutines for `regex_compile'. */
3533 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3535 static void store_op1(re_opcode_t op, unsigned char *loc, int arg)
3537 *loc = (unsigned char)op;
3538 STORE_NUMBER(loc + 1, arg);
3541 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3543 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3545 *loc = (unsigned char)op;
3546 STORE_NUMBER(loc + 1, arg1);
3547 STORE_NUMBER(loc + 3, arg2);
3550 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3551 for OP followed by two-byte integer parameter ARG. */
3554 insert_op1(re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3556 REGISTER unsigned char *pfrom = end;
3557 REGISTER unsigned char *pto = end + 3;
3559 while (pfrom != loc)
3562 store_op1(op, loc, arg);
3565 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3568 insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3571 REGISTER unsigned char *pfrom = end;
3572 REGISTER unsigned char *pto = end + 5;
3574 while (pfrom != loc)
3577 store_op2(op, loc, arg1, arg2);
3580 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3581 after an alternative or a begin-subexpression. We assume there is at
3582 least one character before the ^. */
3585 at_begline_loc_p(re_char *pattern, re_char *p, reg_syntax_t syntax)
3587 re_char *prev = p - 2;
3588 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3591 /* After a subexpression? */
3592 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3593 /* After an alternative? */
3595 && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3598 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3599 at least one character after the $, i.e., `P < PEND'. */
3602 at_endline_loc_p(re_char *p, re_char *pend, int syntax)
3605 re_bool next_backslash = *next == '\\';
3606 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3609 /* Before a subexpression? */
3610 (syntax & RE_NO_BK_PARENS ? *next == ')'
3611 : next_backslash && next_next && *next_next == ')')
3612 /* Before an alternative? */
3613 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3614 : next_backslash && next_next && *next_next == '|');
3617 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3618 false if it's not. */
3621 group_in_compile_stack(compile_stack_type compile_stack, regnum_t regnum)
3625 for (this_element = compile_stack.avail - 1;
3626 this_element >= 0; this_element--)
3627 if (compile_stack.stack[this_element].regnum == regnum) {
3633 /* Read the ending character of a range (in a bracket expression) from the
3634 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3635 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3636 Then we set the translation of all bits between the starting and
3637 ending characters (inclusive) in the compiled pattern B.
3639 Return an error code.
3641 We use these short variable names so we can use the same macros as
3642 `regex_compile' itself. */
3644 static reg_errcode_t
3645 compile_range(re_char ** p_ptr, re_char * pend, RE_TRANSLATE_TYPE translate,
3646 reg_syntax_t syntax, unsigned char *buf_end)
3648 Element_count this_char;
3650 re_char *p = *p_ptr;
3651 int range_start, range_end;
3656 /* Even though the pattern is a signed `char *', we need to fetch
3657 with unsigned char *'s; if the high bit of the pattern character
3658 is set, the range endpoints will be negative if we fetch using a
3661 We also want to fetch the endpoints without translating them; the
3662 appropriate translation is done in the bit-setting loop below. */
3663 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3664 range_start = ((const unsigned char *)p)[-2];
3665 range_end = ((const unsigned char *)p)[0];
3667 /* Have to increment the pointer into the pattern string, so the
3668 caller isn't still at the ending character. */
3671 /* If the start is after the end, the range is empty. */
3672 if (range_start > range_end)
3673 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3675 /* Here we see why `this_char' has to be larger than an `unsigned
3676 char' -- the range is inclusive, so if `range_end' == 0xff
3677 (assuming 8-bit characters), we would otherwise go into an infinite
3678 loop, since all characters <= 0xff. */
3679 for (this_char = range_start; this_char <= range_end; this_char++) {
3680 SET_LIST_BIT(TRANSLATE(this_char));
3688 static reg_errcode_t
3689 compile_extended_range(re_char ** p_ptr, re_char * pend,
3690 RE_TRANSLATE_TYPE translate,
3691 reg_syntax_t syntax, Lisp_Object rtab)
3693 Emchar this_char, range_start, range_end;
3699 p = (const Bufbyte *)*p_ptr;
3700 range_end = charptr_emchar(p);
3701 p--; /* back to '-' */
3702 DEC_CHARPTR(p); /* back to start of range */
3703 /* We also want to fetch the endpoints without translating them; the
3704 appropriate translation is done in the bit-setting loop below. */
3705 range_start = charptr_emchar(p);
3706 INC_CHARPTR(*p_ptr);
3708 /* If the start is after the end, the range is empty. */
3709 if (range_start > range_end)
3710 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3712 /* Can't have ranges spanning different charsets, except maybe for
3713 ranges entirely within the first 256 chars. */
3715 if ((range_start >= 0x100 || range_end >= 0x100)
3716 && CHAR_LEADING_BYTE(range_start) != CHAR_LEADING_BYTE(range_end))
3717 return REG_ERANGESPAN;
3719 /* As advertised, translations only work over the 0 - 0x7F range.
3720 Making this kind of stuff work generally is much harder.
3721 Iterating over the whole range like this would be way efficient
3722 if the range encompasses 10,000 chars or something. You'd have
3723 to do something like this:
3727 map over translation table in [range_start, range_end] of
3728 (put the mapped range in a;
3729 put the translation in b)
3730 invert the range in a and truncate to [range_start, range_end]
3731 compute the union of a, b
3732 union the result into rtab
3734 for (this_char = range_start;
3735 this_char <= range_end && this_char < 0x80; this_char++) {
3736 SET_RANGETAB_BIT(TRANSLATE(this_char));
3739 if (this_char <= range_end)
3740 put_range_table(rtab, this_char, range_end, Qt);
3747 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3748 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3749 characters can start a string that matches the pattern. This fastmap
3750 is used by re_search to skip quickly over impossible starting points.
3752 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3753 area as BUFP->fastmap.
3755 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3758 Returns 0 if we succeed, -2 if an internal error. */
3760 int re_compile_fastmap(struct re_pattern_buffer *bufp)
3763 #ifdef MATCH_MAY_ALLOCATE
3764 fail_stack_type fail_stack;
3766 DECLARE_DESTINATION;
3767 /* We don't push any register information onto the failure stack. */
3769 REGISTER char *fastmap = bufp->fastmap;
3770 unsigned char *pattern = bufp->buffer;
3771 unsigned long size = bufp->used;
3772 /* actually p supposed to carry the const qualifier, however, some
3773 silly mule code below CHANGES p and hence we cant go with the const
3774 qualifier here, leaving an `unfixable' warning on the way */
3776 unsigned char *p = pattern;
3778 re_char *p = pattern;
3780 REGISTER unsigned char *pend = pattern + size;
3783 /* This holds the pointer to the failure stack, when
3784 it is allocated relocatably. */
3785 fail_stack_elt_t *failure_stack_ptr;
3788 /* Assume that each path through the pattern can be null until
3789 proven otherwise. We set this false at the bottom of switch
3790 statement, to which we get only if a particular path doesn't
3791 match the empty string. */
3792 re_bool path_can_be_null = true;
3794 /* We aren't doing a `succeed_n' to begin with. */
3795 re_bool succeed_n_p = false;
3797 assert(fastmap != NULL && p != NULL);
3800 memset(fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3801 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3802 bufp->can_be_null = 0;
3805 if (p == pend || *p == succeed) {
3806 /* We have reached the (effective) end of pattern. */
3807 if (!FAIL_STACK_EMPTY()) {
3808 bufp->can_be_null |= path_can_be_null;
3810 /* Reset for next path. */
3811 path_can_be_null = true;
3813 /* fuck, p isnt const thanks to that
3814 * unified range table function below */
3816 p = (unsigned char*)fail_stack.
3817 stack[--fail_stack.avail].pointer;
3819 p = fail_stack.stack[--fail_stack.avail]
3829 /* We should never be about to go beyond the end of the pattern. */
3832 switch (SWITCH_ENUM_CAST((re_opcode_t) * p++)) {
3834 /* I guess the idea here is to simply not bother with a
3835 fastmap if a backreference is used, since it's too
3836 hard to figure out the fastmap for the corresponding
3837 group. Setting `can_be_null' stops `re_search_2'
3838 from using the fastmap, so that is all we do. */
3840 bufp->can_be_null = 1;
3843 /* Following are the cases which match a character.
3844 These end with `break'. */
3851 /* XEmacs: Under Mule, these bit vectors will
3852 only contain values for characters below 0x80. */
3853 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3854 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3859 /* Chars beyond end of map must be allowed. */
3861 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3863 /* And all extended characters must be allowed, too. */
3864 for (j = 0x80; j < 0xA0; j++)
3866 #else /* not MULE */
3867 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3871 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3873 (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3878 case charset_mule: {
3882 nentries = unified_range_table_nentries(p);
3883 for (i = 0; i < nentries; i++) {
3884 EMACS_INT first, last;
3885 Lisp_Object dummy_val;
3887 Bufbyte strr[MAX_EMCHAR_LEN];
3889 unified_range_table_get_range(p, i,
3894 jj <= last && jj < 0x80; jj++)
3896 /* Ranges below 0x100 can span charsets, but there
3897 are only two (Control-1 and Latin-1), and
3898 either first or last has to be in them. */
3899 set_charptr_emchar(strr, first);
3902 set_charptr_emchar(strr, last);
3909 case charset_mule_not: {
3913 nentries = unified_range_table_nentries(p);
3914 for (i = 0; i < nentries; i++) {
3915 EMACS_INT first, last;
3916 Lisp_Object dummy_val;
3918 int smallest_prev = 0;
3920 unified_range_table_get_range(p, i,
3924 for (jj = smallest_prev;
3925 jj < first && jj < 0x80; jj++)
3927 smallest_prev = last + 1;
3928 if (smallest_prev >= 0x80)
3931 /* Calculating which leading bytes are actually allowed
3932 here is rather difficult, so we just punt and allow
3934 for (i = 0x80; i < 0xA0; i++)
3945 for (j = 0; j < (1 << BYTEWIDTH); j++)
3948 (regex_emacs_buffer->mirror_syntax_table),
3957 goto matchnotsyntax;
3959 for (j = 0; j < (1 << BYTEWIDTH); j++)
3962 (regex_emacs_buffer->mirror_syntax_table),
3969 int fastmap_newline = fastmap['\n'];
3971 /* `.' matches anything ... */
3973 /* "anything" only includes bytes that can be the
3974 first byte of a character. */
3975 for (j = 0; j < 0xA0; j++)
3978 for (j = 0; j < (1 << BYTEWIDTH); j++)
3982 /* ... except perhaps newline. */
3983 if (!(bufp->syntax & RE_DOT_NEWLINE))
3984 fastmap['\n'] = fastmap_newline;
3986 /* Return if we have already set `can_be_null'; if we
3987 have, then the fastmap is irrelevant. Something's
3989 else if (bufp->can_be_null)
3992 /* Otherwise, have to check alternative paths. */
4003 /* This match depends on text properties. These end with
4004 aborting optimizations. */
4005 bufp->can_be_null = 1;
4011 for (j = 0; j < 0x80; j++)
4014 (regex_emacs_buffer->mirror_syntax_table),
4015 j) == (enum syntaxcode)k)
4017 for (j = 0x80; j < 0xA0; j++) {
4018 if (LEADING_BYTE_PREFIX_P(j))
4019 /* too complicated to calculate this right */
4025 cset = CHARSET_BY_LEADING_BYTE(j);
4026 if (CHARSETP(cset)) {
4028 (regex_emacs_buffer, cset,
4030 == Sword || multi_p)
4035 #else /* not MULE */
4036 for (j = 0; j < (1 << BYTEWIDTH); j++)
4039 (regex_emacs_buffer->mirror_syntax_table),
4040 j) == (enum syntaxcode)k)
4047 for (j = 0; j < 0x80; j++)
4050 (regex_emacs_buffer->mirror_syntax_table),
4051 j) != (enum syntaxcode)k)
4053 for (j = 0x80; j < 0xA0; j++) {
4054 if (LEADING_BYTE_PREFIX_P(j))
4055 /* too complicated to calculate this right */
4061 cset = CHARSET_BY_LEADING_BYTE(j);
4062 if (CHARSETP(cset)) {
4064 (regex_emacs_buffer, cset,
4066 != Sword || multi_p)
4071 #else /* not MULE */
4072 for (j = 0; j < (1 << BYTEWIDTH); j++)
4075 (regex_emacs_buffer->mirror_syntax_table),
4076 j) != (enum syntaxcode)k)
4083 /* 97/2/17 jhod category patch */
4085 case notcategoryspec:
4086 bufp->can_be_null = 1;
4088 /* end if category patch */
4091 /* All cases after this match the empty string. These
4092 end with `continue'. */
4111 case push_dummy_failure:
4115 case pop_failure_jump:
4116 case maybe_pop_jump:
4119 case dummy_failure_jump:
4120 EXTRACT_NUMBER_AND_INCR(j, p);
4125 /* Jump backward implies we just went through the body
4126 of a loop and matched nothing. Opcode jumped to
4127 should be `on_failure_jump' or `succeed_n'. Just
4128 treat it like an ordinary jump. For a * loop, it has
4129 pushed its failure point already; if so, discard that
4131 if ((re_opcode_t) * p != on_failure_jump
4132 && (re_opcode_t) * p != succeed_n)
4136 EXTRACT_NUMBER_AND_INCR(j, p);
4139 /* If what's on the stack is where we are now, pop
4141 if (!FAIL_STACK_EMPTY()
4142 && fail_stack.stack[fail_stack.avail - 1].pointer ==
4148 case on_failure_jump:
4149 case on_failure_keep_string_jump:
4150 handle_on_failure_jump:
4151 EXTRACT_NUMBER_AND_INCR(j, p);
4153 /* For some patterns, e.g., `(a?)?', `p+j' here points
4154 to the end of the pattern. We don't want to push
4155 such a point, since when we restore it above,
4156 entering the switch will increment `p' past the end
4157 of the pattern. We don't need to push such a point
4158 since we obviously won't find any more fastmap
4159 entries beyond `pend'. Such a pattern can match the
4160 null string, though. */
4162 if (!PUSH_PATTERN_OP(p + j, fail_stack)) {
4167 bufp->can_be_null = 1;
4170 EXTRACT_NUMBER_AND_INCR(k, p); /* Skip the n. */
4171 succeed_n_p = false;
4177 /* Get to the number of times to succeed. */
4180 /* Increment p past the n for when k != 0. */
4181 EXTRACT_NUMBER_AND_INCR(k, p);
4184 succeed_n_p = true; /* Spaghetti code alert. */
4185 goto handle_on_failure_jump;
4200 abort(); /* We have listed all the cases. */
4203 /* Getting here means we have found the possible starting
4204 characters for one path of the pattern -- and that the empty
4205 string does not match. We need not follow this path further.
4206 Instead, look at the next alternative (remembered on the
4207 stack), or quit if no more. The test at the top of the loop
4208 does these things. */
4209 path_can_be_null = false;
4213 /* Set `can_be_null' for the last path (also the first path, if the
4214 pattern is empty). */
4215 bufp->can_be_null |= path_can_be_null;
4220 } /* re_compile_fastmap */
4222 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4223 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4224 this memory for recording register information. STARTS and ENDS
4225 must be allocated using the malloc library routine, and must each
4226 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4228 If NUM_REGS == 0, then subsequent matches should allocate their own
4231 Unless this function is called, the first search or match using
4232 PATTERN_BUFFER will allocate its own register data, without
4233 freeing the old data. */
4236 re_set_registers(struct re_pattern_buffer *bufp, struct re_registers *regs,
4237 unsigned num_regs, regoff_t * starts, regoff_t * ends)
4240 bufp->regs_allocated = REGS_REALLOCATE;
4241 regs->num_regs = num_regs;
4242 regs->start = starts;
4245 bufp->regs_allocated = REGS_UNALLOCATED;
4247 regs->start = regs->end = (regoff_t *) 0;
4251 /* Searching routines. */
4253 /* Like re_search_2, below, but only one string is specified, and
4254 doesn't let you say where to stop matching. */
4257 re_search(struct re_pattern_buffer *bufp, const char *string, int size,
4258 int startpos, int range, struct re_registers *regs)
4260 return re_search_2(bufp, NULL, 0, string, size, startpos, range,
4265 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
4266 # define bytecount_to_charcount(ptr, len) (len)
4267 # define charcount_to_bytecount(ptr, len) (len)
4268 typedef int Charcount;
4271 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4272 virtual concatenation of STRING1 and STRING2, starting first at index
4273 STARTPOS, then at STARTPOS + 1, and so on.
4275 With MULE, STARTPOS is a byte position, not a char position. And the
4276 search will increment STARTPOS by the width of the current leading
4279 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4281 RANGE is how far to scan while trying to match. RANGE = 0 means try
4282 only at STARTPOS; in general, the last start tried is STARTPOS +
4285 With MULE, RANGE is a byte position, not a char position. The last
4286 start tried is the character starting <= STARTPOS + RANGE.
4288 In REGS, return the indices of the virtual concatenation of STRING1
4289 and STRING2 that matched the entire BUFP->buffer and its contained
4292 Do not consider matching one past the index STOP in the virtual
4293 concatenation of STRING1 and STRING2.
4295 We return either the position in the strings at which the match was
4296 found, -1 if no match, or -2 if error (such as failure
4300 re_search_2(struct re_pattern_buffer *bufp, const char *str1,
4301 int size1, const char *str2, int size2, int startpos,
4302 int range, struct re_registers *regs, int stop)
4305 re_char *string1 = (re_char *) str1;
4306 re_char *string2 = (re_char *) str2;
4307 REGISTER char *fastmap = bufp->fastmap;
4308 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
4309 int total_size = size1 + size2;
4310 int endpos = startpos + range;
4311 #ifdef REGEX_BEGLINE_CHECK
4312 int anchored_at_begline = 0;
4317 /* Check for out-of-range STARTPOS. */
4318 if (startpos < 0 || startpos > total_size)
4321 /* Fix up RANGE if it might eventually take us outside
4322 the virtual concatenation of STRING1 and STRING2. */
4324 range = 0 - startpos;
4325 else if (endpos > total_size)
4326 range = total_size - startpos;
4328 /* If the search isn't to be a backwards one, don't waste time in a
4329 search for a pattern that must be anchored. */
4330 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf
4335 d = ((const unsigned char *)
4337 size1 ? string2 - size1 : string1) + startpos);
4338 range = charcount_to_bytecount(d, 1);
4342 /* In a forward search for something that starts with \=.
4343 don't keep searching past point. */
4344 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot
4347 BUF_PT(regex_emacs_buffer) - BUF_BEGV(regex_emacs_buffer)
4354 /* Update the fastmap now if not correct already. */
4355 if (fastmap && !bufp->fastmap_accurate)
4356 if (re_compile_fastmap(bufp) == -2)
4359 #ifdef REGEX_BEGLINE_CHECK
4361 unsigned long i = 0;
4363 while (i < bufp->used) {
4364 if (bufp->buffer[i] == start_memory ||
4365 bufp->buffer[i] == stop_memory)
4370 anchored_at_begline = i < bufp->used
4371 && bufp->buffer[i] == begline;
4376 SETUP_SYNTAX_CACHE_FOR_OBJECT(regex_match_object,
4378 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR
4379 (regex_match_object, regex_emacs_buffer,
4383 /* Loop through the string, looking for a place to start matching. */
4385 #ifdef REGEX_BEGLINE_CHECK
4386 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4387 then we can speed things up by skipping to the next beginning-of-
4389 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4391 /* whose stupid idea was it anyway to make this
4392 function take two strings to match?? */
4396 if (startpos < size1 && startpos + range >= size1)
4397 lim = range - (size1 - startpos);
4399 d = ((const unsigned char *)
4401 size1 ? string2 - size1 : string1) + startpos);
4402 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4403 d_size = charcount_to_bytecount(d, 1);
4405 if (TRANSLATE_P(translate))
4406 while (range > lim && *d != '\n') {
4407 d += d_size; /* Speedier INC_CHARPTR(d) */
4408 d_size = charcount_to_bytecount(d, 1);
4411 while (range > lim && *d != '\n') {
4412 d += d_size; /* Speedier INC_CHARPTR(d) */
4413 d_size = charcount_to_bytecount(d, 1);
4417 startpos += irange - range;
4419 #endif /* REGEX_BEGLINE_CHECK */
4421 /* If a fastmap is supplied, skip quickly over characters that
4422 cannot be the start of a match. If the pattern can match the
4423 null string, however, we don't need to skip characters; we want
4424 the first null string. */
4425 if (fastmap && startpos < total_size && !bufp->can_be_null) {
4426 if (range > 0) { /* Searching forwards. */
4430 if (startpos < size1
4431 && startpos + range >= size1)
4432 lim = range - (size1 - startpos);
4434 d = ((const unsigned char *)
4436 size1 ? string2 - size1 : string1) +
4439 /* Written out as an if-else to avoid testing `translate'
4441 if (TRANSLATE_P(translate))
4442 while (range > lim) {
4445 Bufbyte str[MAX_EMCHAR_LEN];
4447 buf_ch = charptr_emchar(d);
4448 buf_ch = RE_TRANSLATE(buf_ch);
4449 set_charptr_emchar(str, buf_ch);
4451 || fastmap[(unsigned char)
4461 charcount_to_bytecount(d,
4464 d += d_size; /* Speedier INC_CHARPTR(d) */
4466 while (range > lim && !fastmap[*d]) {
4468 charcount_to_bytecount(d,
4471 d += d_size; /* Speedier INC_CHARPTR(d) */
4474 startpos += irange - range;
4475 } else { /* Searching backwards. */
4477 Emchar c = (size1 == 0 || startpos >= size1
4478 ? charptr_emchar(string2 +
4480 : charptr_emchar(string1 +
4484 if (!(c >= 0200 || fastmap[(unsigned char)c]))
4487 if (!fastmap[(unsigned char)c])
4493 /* If can't match the null string, and that's all we have left, fail. */
4494 if (range >= 0 && startpos == total_size && fastmap
4495 && !bufp->can_be_null)
4498 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4499 if (!no_quit_in_re_search)
4502 val = re_match_2_internal(bufp, string1, size1, string2, size2,
4503 startpos, regs, stop);
4504 #ifndef REGEX_MALLOC
4519 else if (range > 0) {
4520 d = ((const unsigned char *)
4522 size1 ? string2 - size1 : string1) + startpos);
4523 d_size = charcount_to_bytecount(d, 1);
4527 /* Note startpos > size1 not >=. If we are on the
4528 string1/string2 boundary, we want to backup into string1. */
4529 d = ((const unsigned char *)
4531 size1 ? string2 - size1 : string1) + startpos);
4533 d_size = charcount_to_bytecount(d, 1);
4541 /* Declarations and macros for re_match_2. */
4543 /* This converts PTR, a pointer into one of the search strings `string1'
4544 and `string2' into an offset from the beginning of that string. */
4545 #define POINTER_TO_OFFSET(ptr) \
4546 (FIRST_STRING_P (ptr) \
4547 ? ((regoff_t) ((ptr) - string1)) \
4548 : ((regoff_t) ((ptr) - string2 + size1)))
4550 /* Macros for dealing with the split strings in re_match_2. */
4552 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4554 /* Call before fetching a character with *d. This switches over to
4555 string2 if necessary. */
4556 #define REGEX_PREFETCH() \
4559 /* End of string2 => fail. */ \
4560 if (dend == end_match_2) \
4562 /* End of string1 => advance to string2. */ \
4564 dend = end_match_2; \
4567 /* Test if at very beginning or at very end of the virtual concatenation
4568 of `string1' and `string2'. If only one string, it's `string2'. */
4569 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4570 #define AT_STRINGS_END(d) ((d) == end2)
4573 If the given position straddles the string gap, return the equivalent
4574 position that is before or after the gap, respectively; otherwise,
4575 return the same position. */
4576 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4577 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4579 /* Test if CH is a word-constituent character. (XEmacs change) */
4580 #define WORDCHAR_P_UNSAFE(ch) \
4581 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4584 /* Free everything we malloc. */
4585 #ifdef MATCH_MAY_ALLOCATE
4586 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4587 #define FREE_VARIABLES() \
4589 REGEX_FREE_STACK (fail_stack.stack); \
4590 FREE_VAR (regstart); \
4591 FREE_VAR (regend); \
4592 FREE_VAR (old_regstart); \
4593 FREE_VAR (old_regend); \
4594 FREE_VAR (best_regstart); \
4595 FREE_VAR (best_regend); \
4596 FREE_VAR (reg_info); \
4597 FREE_VAR (reg_dummy); \
4598 FREE_VAR (reg_info_dummy); \
4600 #else /* not MATCH_MAY_ALLOCATE */
4601 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4602 #endif /* MATCH_MAY_ALLOCATE */
4604 /* These values must meet several constraints. They must not be valid
4605 register values; since we have a limit of 255 registers (because
4606 we use only one byte in the pattern for the register number), we can
4607 use numbers larger than 255. They must differ by 1, because of
4608 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4609 be larger than the value for the highest register, so we do not try
4610 to actually save any registers when none are active. */
4611 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4612 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4614 /* Matching routines. */
4616 #ifndef emacs /* Emacs never uses this. */
4617 /* re_match is like re_match_2 except it takes only a single string. */
4620 re_match(struct re_pattern_buffer *bufp, const char *string, int size,
4621 int pos, struct re_registers *regs)
4624 re_match_2_internal(bufp, NULL, 0, (re_char *) string, size,
4629 #endif /* not emacs */
4631 /* re_match_2 matches the compiled pattern in BUFP against the
4632 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4633 SIZE2, respectively). We start matching at POS, and stop matching
4636 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4637 store offsets for the substring each group matched in REGS. See the
4638 documentation for exactly how many groups we fill.
4640 We return -1 if no match, -2 if an internal error (such as the
4641 failure stack overflowing). Otherwise, we return the length of the
4642 matched substring. */
4645 re_match_2(struct re_pattern_buffer *bufp, const char *string1,
4646 int size1, const char *string2, int size2, int pos,
4647 struct re_registers *regs, int stop)
4652 SETUP_SYNTAX_CACHE_FOR_OBJECT(regex_match_object,
4654 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR
4655 (regex_match_object, regex_emacs_buffer,
4659 result = re_match_2_internal(bufp, (re_char *) string1, size1,
4660 (re_char *) string2, size2,
4667 /* This is a separate function so that we can force an alloca cleanup
4670 re_match_2_internal(struct re_pattern_buffer *bufp, re_char * string1,
4671 int size1, re_char * string2, int size2, int pos,
4672 struct re_registers *regs, int stop)
4674 /* General temporaries. */
4677 int should_succeed; /* XEmacs change */
4679 /* Just past the end of the corresponding string. */
4680 re_char *end1, *end2;
4682 /* Pointers into string1 and string2, just past the last characters in
4683 each to consider matching. */
4684 re_char *end_match_1, *end_match_2;
4686 /* Where we are in the data, and the end of the current string. */
4689 /* Where we are in the pattern, and the end of the pattern. */
4690 unsigned char *p = bufp->buffer;
4691 REGISTER unsigned char *pend = p + bufp->used;
4693 /* Mark the opcode just after a start_memory, so we can test for an
4694 empty subpattern when we get to the stop_memory. */
4695 re_char *just_past_start_mem = 0;
4697 /* We use this to map every character in the string. */
4698 RE_TRANSLATE_TYPE translate = bufp->translate;
4700 /* Failure point stack. Each place that can handle a failure further
4701 down the line pushes a failure point on this stack. It consists of
4702 restart, regend, and reg_info for all registers corresponding to
4703 the subexpressions we're currently inside, plus the number of such
4704 registers, and, finally, two char *'s. The first char * is where
4705 to resume scanning the pattern; the second one is where to resume
4706 scanning the strings. If the latter is zero, the failure point is
4707 a ``dummy''; if a failure happens and the failure point is a dummy,
4708 it gets discarded and the next one is tried. */
4709 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4710 fail_stack_type fail_stack;
4712 #ifdef REGEX_DEBUG_FLAG
4713 static unsigned failure_id;
4714 int nfailure_points_pushed = 0, nfailure_points_popped = 0;
4718 /* This holds the pointer to the failure stack, when
4719 it is allocated relocatably. */
4720 fail_stack_elt_t *failure_stack_ptr;
4723 /* We fill all the registers internally, independent of what we
4724 return, for use in backreferences. The number here includes
4725 an element for register zero. */
4726 int num_regs = bufp->re_ngroups + 1;
4728 /* The currently active registers. */
4729 int lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4730 int highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4732 /* Information on the contents of registers. These are pointers into
4733 the input strings; they record just what was matched (on this
4734 attempt) by a subexpression part of the pattern, that is, the
4735 regnum-th regstart pointer points to where in the pattern we began
4736 matching and the regnum-th regend points to right after where we
4737 stopped matching the regnum-th subexpression. (The zeroth register
4738 keeps track of what the whole pattern matches.) */
4739 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4740 re_char **regstart, **regend;
4743 /* If a group that's operated upon by a repetition operator fails to
4744 match anything, then the register for its start will need to be
4745 restored because it will have been set to wherever in the string we
4746 are when we last see its open-group operator. Similarly for a
4748 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4749 re_char **old_regstart, **old_regend;
4752 /* The is_active field of reg_info helps us keep track of which (possibly
4753 nested) subexpressions we are currently in. The matched_something
4754 field of reg_info[reg_num] helps us tell whether or not we have
4755 matched any of the pattern so far this time through the reg_num-th
4756 subexpression. These two fields get reset each time through any
4757 loop their register is in. */
4758 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4759 register_info_type *reg_info;
4762 /* The following record the register info as found in the above
4763 variables when we find a match better than any we've seen before.
4764 This happens as we backtrack through the failure points, which in
4765 turn happens only if we have not yet matched the entire string. */
4766 unsigned best_regs_set = false;
4767 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4768 re_char **best_regstart, **best_regend;
4771 /* Logically, this is `best_regend[0]'. But we don't want to have to
4772 allocate space for that if we're not allocating space for anything
4773 else (see below). Also, we never need info about register 0 for
4774 any of the other register vectors, and it seems rather a kludge to
4775 treat `best_regend' differently than the rest. So we keep track of
4776 the end of the best match so far in a separate variable. We
4777 initialize this to NULL so that when we backtrack the first time
4778 and need to test it, it's not garbage. */
4779 re_char *match_end = NULL;
4781 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4782 int set_regs_matched_done = 0;
4784 /* Used when we pop values we don't care about. */
4785 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4786 re_char **reg_dummy;
4787 register_info_type *reg_info_dummy;
4790 #ifdef REGEX_DEBUG_FLAG
4791 /* Counts the total number of registers pushed. */
4792 unsigned num_regs_pushed = 0;
4795 /* 1 if this match ends in the same string (string1 or string2)
4796 as the best previous match. */
4799 /* 1 if this match is the best seen so far. */
4800 re_bool best_match_p;
4802 DEBUG_PRINT1("\n\nEntering re_match_2.\n");
4806 #ifdef MATCH_MAY_ALLOCATE
4807 /* Do not bother to initialize all the register variables if there are
4808 no groups in the pattern, as it takes a fair amount of time. If
4809 there are groups, we include space for register 0 (the whole
4810 pattern), even though we never use it, since it simplifies the
4811 array indexing. We should fix this. */
4812 if (bufp->re_ngroups) {
4813 regstart = REGEX_TALLOC(num_regs, re_char *);
4814 regend = REGEX_TALLOC(num_regs, re_char *);
4815 old_regstart = REGEX_TALLOC(num_regs, re_char *);
4816 old_regend = REGEX_TALLOC(num_regs, re_char *);
4817 best_regstart = REGEX_TALLOC(num_regs, re_char *);
4818 best_regend = REGEX_TALLOC(num_regs, re_char *);
4819 reg_info = REGEX_TALLOC(num_regs, register_info_type);
4820 reg_dummy = REGEX_TALLOC(num_regs, re_char *);
4821 reg_info_dummy = REGEX_TALLOC(num_regs, register_info_type);
4824 (regstart && regend && old_regstart && old_regend
4825 && reg_info && best_regstart && best_regend && reg_dummy
4826 && reg_info_dummy)) {
4831 /* We must initialize all our variables to NULL, so that
4832 `FREE_VARIABLES' doesn't try to free them. */
4833 regstart = regend = old_regstart = old_regend = best_regstart
4834 = best_regend = reg_dummy = NULL;
4835 reg_info = reg_info_dummy = (register_info_type *) NULL;
4837 #endif /* MATCH_MAY_ALLOCATE */
4839 /* The starting position is bogus. */
4840 if (pos < 0 || pos > size1 + size2) {
4845 /* Initialize subexpression text positions to -1 to mark ones that no
4846 start_memory/stop_memory has been seen for. Also initialize the
4847 register information struct. */
4848 for (mcnt = 1; mcnt < num_regs; mcnt++) {
4849 regstart[mcnt] = regend[mcnt]
4850 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4852 REG_MATCH_NULL_STRING_P(reg_info[mcnt]) =
4853 MATCH_NULL_UNSET_VALUE;
4854 IS_ACTIVE(reg_info[mcnt]) = 0;
4855 MATCHED_SOMETHING(reg_info[mcnt]) = 0;
4856 EVER_MATCHED_SOMETHING(reg_info[mcnt]) = 0;
4858 /* We move `string1' into `string2' if the latter's empty -- but not if
4859 `string1' is null. */
4860 if (size2 == 0 && string1 != NULL) {
4866 end1 = string1 + size1;
4867 end2 = string2 + size2;
4869 /* Compute where to stop matching, within the two strings. */
4870 if (stop <= size1) {
4871 end_match_1 = string1 + stop;
4872 end_match_2 = string2;
4875 end_match_2 = string2 + stop - size1;
4878 /* `p' scans through the pattern as `d' scans through the data.
4879 `dend' is the end of the input string that `d' points within. `d'
4880 is advanced into the following input string whenever necessary, but
4881 this happens before fetching; therefore, at the beginning of the
4882 loop, `d' can be pointing at the end of a string, but it cannot
4884 if (size1 > 0 && pos <= size1) {
4888 d = string2 + pos - size1;
4892 DEBUG_PRINT1("The compiled pattern is: \n");
4893 DEBUG_PRINT_COMPILED_PATTERN(bufp, p, pend);
4894 DEBUG_PRINT1("The string to match is: `");
4895 DEBUG_PRINT_DOUBLE_STRING(d, string1, size1, string2, size2);
4896 DEBUG_PRINT1("'\n");
4898 /* This loops over pattern commands. It exits by returning from the
4899 function if the match is complete, or it drops through if the match
4900 fails at this starting point in the input data. */
4902 DEBUG_PRINT2("\n0x%lx: ", (long)p);
4903 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4904 if (!no_quit_in_re_search)
4909 /* End of pattern means we might have succeeded. */
4910 DEBUG_PRINT1("end of pattern ... ");
4912 /* If we haven't matched the entire string, and we want
4913 the longest match, try backtracking. */
4914 if (d != end_match_2) {
4915 same_str_p = (FIRST_STRING_P(match_end)
4916 == MATCHING_IN_FIRST_STRING);
4918 /* AIX compiler got confused when this was
4919 combined with the previous declaration. */
4921 best_match_p = d > match_end;
4924 !MATCHING_IN_FIRST_STRING;
4926 DEBUG_PRINT1("backtracking.\n");
4928 if (!FAIL_STACK_EMPTY()) {
4929 /* More failure points to try. */
4931 /* If exceeds best match so far, save
4933 if (!best_regs_set || best_match_p) {
4934 best_regs_set = true;
4938 ("\nSAVING match as best so far.\n");
4940 for (mcnt = 1; mcnt < num_regs;
4942 best_regstart[mcnt] =
4951 /* If no failure points, don't restore garbage.
4952 And if last match is real best match, don't
4953 restore second best one. */
4954 else if (best_regs_set && !best_match_p) {
4956 /* Restore best match. It may happen
4957 that `dend == end_match_1' while the
4958 restored d is in string2. For
4959 example, the pattern `x.*y.*z'
4960 against the strings `x-' and `y-z-',
4961 if the two strings are not
4962 consecutive in memory. */
4964 ("Restoring best registers.\n");
4967 dend = ((d >= string1 && d <= end1)
4968 ? end_match_1 : end_match_2);
4970 for (mcnt = 1; mcnt < num_regs; mcnt++) {
4972 best_regstart[mcnt];
4978 /* d != end_match_2 */
4980 DEBUG_PRINT1("Accepting match.\n");
4982 int num_nonshy_regs = bufp->re_nsub + 1;
4983 /* If caller wants register contents data back,
4985 if (regs && !bufp->no_sub) {
4986 /* Have the register data arrays been
4988 if (bufp->regs_allocated == REGS_UNALLOCATED) {
4989 /* No. So allocate them with malloc.
4990 We need one extra element beyond
4991 `num_regs' for the `-1' marker GNU
4993 regs->num_regs = MAX(RE_NREGS, num_nonshy_regs + 1);
4994 regs->start = TALLOC(regs->num_regs, regoff_t);
4995 regs->end = TALLOC(regs->num_regs, regoff_t);
4996 if (regs->start == NULL || regs->end == NULL) {
5000 bufp->regs_allocated = REGS_REALLOCATE;
5001 } else if (bufp->regs_allocated == REGS_REALLOCATE) {
5002 /* Yes. If we need more elements than were already
5003 allocated, reallocate them. If we need fewer, just
5005 if (regs->num_regs < num_nonshy_regs + 1) {
5006 regs->num_regs = num_nonshy_regs + 1;
5007 RETALLOC(regs->start, regs->num_regs, regoff_t);
5008 RETALLOC(regs->end, regs->num_regs, regoff_t);
5009 if (regs->start == NULL || regs->end == NULL) {
5015 /* The braces fend off a "empty body in an else-statement"
5016 warning under GCC when assert expands to nothing. */
5017 assert (bufp->regs_allocated == REGS_FIXED);
5020 /* Convert the pointer data in `regstart' and `regend' to
5021 indices. Register zero has to be set differently,
5022 since we haven't kept track of any info for it. */
5023 if (regs->num_regs > 0) {
5024 regs->start[0] = pos;
5025 regs->end[0] = (MATCHING_IN_FIRST_STRING
5026 ? ((regoff_t) (d - string1))
5027 : ((regoff_t) (d - string2 + size1)));
5030 /* Map over the NUM_NONSHY_REGS non-shy internal registers.
5031 Copy each into the corresponding external register.
5032 N.B. MCNT indexes external registers. */
5034 mcnt < MIN (num_nonshy_regs, regs->num_regs);
5036 int ireg = bufp->external_to_internal_register[mcnt];
5038 if (REG_UNSET (regstart[ireg]) || REG_UNSET (regend[ireg]))
5039 regs->start[mcnt] = regs->end[mcnt] = -1;
5042 = (regoff_t) POINTER_TO_OFFSET (regstart[ireg]);
5044 = (regoff_t) POINTER_TO_OFFSET (regend[ireg]);
5047 } /* regs && !bufp->no_sub */
5049 /* If we have regs and the regs structure has
5050 more elements than were in the pattern, set
5051 the extra elements to -1. If we
5052 (re)allocated the registers, this is the
5053 case, because we always allocate enough to
5054 have at least one -1 at the end.
5056 We do this even when no_sub is set because
5057 some applications (XEmacs) reuse register
5058 structures which may contain stale
5059 information, and permit attempts to access
5062 It would be possible to require the caller to
5063 do this, but we'd have to change the API for
5064 this function to reflect that, and audit all
5066 if (regs && regs->num_regs > 0)
5067 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++)
5068 regs->start[mcnt] = regs->end[mcnt] = -1;
5071 DEBUG_PRINT4("%u failure points pushed, %u popped (%u remain).\n",
5072 nfailure_points_pushed, nfailure_points_popped,
5073 nfailure_points_pushed - nfailure_points_popped);
5074 DEBUG_PRINT2("%u registers pushed.\n", num_regs_pushed);
5076 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
5080 DEBUG_PRINT2("Returning %d from re_match_2.\n", mcnt);
5086 /* Otherwise match next pattern command. */
5087 switch (SWITCH_ENUM_CAST((re_opcode_t) *p++)) {
5088 /* Ignore these. Used to ignore the n of succeed_n's
5089 which currently have n == 0. */
5091 DEBUG_PRINT1("EXECUTING no_op.\n");
5095 DEBUG_PRINT1("EXECUTING succeed.\n");
5098 /* Match the next n pattern characters exactly. The
5099 following byte in the pattern defines n, and the n
5100 bytes after that are the characters to match. */
5103 DEBUG_PRINT2("EXECUTING exactn %d.\n", mcnt);
5105 /* This is written out as an if-else so we don't waste
5106 time testing `translate' inside the loop. */
5107 if (TRANSLATE_P(translate)) {
5110 Emchar pat_ch, buf_ch;
5114 pat_ch = charptr_emchar(p);
5115 buf_ch = charptr_emchar(d);
5116 if (RE_TRANSLATE(buf_ch) != pat_ch)
5119 pat_len = charcount_to_bytecount(p, 1);
5124 #else /* not MULE */
5126 if ((unsigned char)RE_TRANSLATE(*d++) !=
5144 /* Match any character except possibly a newline or a
5147 DEBUG_PRINT1("EXECUTING anychar.\n");
5151 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5152 && TRANSLATE(*d) == '\n')
5153 || (bufp->syntax & RE_DOT_NOT_NULL
5154 && TRANSLATE(*d) == '\000'))
5158 DEBUG_PRINT2(" Matched `%d'.\n", *d);
5159 INC_CHARPTR(d); /* XEmacs change */
5164 REGISTER unsigned char c;
5166 (re_opcode_t) * (p - 1) == charset_not;
5168 DEBUG_PRINT2("EXECUTING charset%s.\n",
5169 not_p ? "_not" : "");
5172 /* The character to match. */
5175 /* Cast to `unsigned' instead of `unsigned char'
5176 in case the bit list is a full 32 bytes
5178 if (c < (unsigned)(*p * BYTEWIDTH)
5180 c / BYTEWIDTH] & (1 << (c %
5190 INC_CHARPTR(d); /* XEmacs change */
5196 case charset_mule_not: {
5199 (re_opcode_t) * (p - 1) == charset_mule_not;
5201 DEBUG_PRINT2("EXECUTING charset_mule%s.\n",
5202 not_p ? "_not" : "");
5205 c = charptr_emchar((const Bufbyte *)d);
5206 /* The character to match. */
5211 unified_range_table_lookup(p, c, Qnil)))
5214 p += unified_range_table_bytes_used(p);
5225 /* The beginning of a group is represented by
5226 start_memory. The arguments are the register number
5227 in the next byte, and the number of groups inner to
5228 this one in the next. The text matched within the
5229 group is recorded (in the internal registers data
5230 structure) under the register number. */
5232 DEBUG_PRINT3("EXECUTING start_memory %d (%d):\n", *p,
5235 /* Find out if this group can match the empty string. */
5236 p1 = p; /* To send to group_match_null_string_p. */
5238 if (REG_MATCH_NULL_STRING_P(reg_info[*p]) ==
5239 MATCH_NULL_UNSET_VALUE)
5240 REG_MATCH_NULL_STRING_P(reg_info[*p])
5241 = group_match_null_string_p(&p1, pend,
5244 /* Save the position in the string where we were the
5245 last time we were at this open-group operator in case
5246 the group is operated upon by a repetition operator,
5247 e.g., with `(a*)*b' against `ab'; then we want to
5248 ignore where we are now in the string in case this
5249 attempt to match fails. */
5250 old_regstart[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
5251 ? REG_UNSET(regstart[*p]) ? d : regstart[*p]
5253 DEBUG_PRINT2(" old_regstart: %d\n",
5254 POINTER_TO_OFFSET(old_regstart[*p]));
5257 DEBUG_PRINT2(" regstart: %d\n",
5258 POINTER_TO_OFFSET(regstart[*p]));
5260 IS_ACTIVE(reg_info[*p]) = 1;
5261 MATCHED_SOMETHING(reg_info[*p]) = 0;
5263 /* Clear this whenever we change the register activity
5265 set_regs_matched_done = 0;
5267 /* This is the new highest active register. */
5268 highest_active_reg = *p;
5270 /* If nothing was active before, this is the new lowest
5272 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5273 lowest_active_reg = *p;
5275 /* Move past the register number and inner group
5278 just_past_start_mem = p;
5282 /* The stop_memory opcode represents the end of a group.
5283 Its arguments are the same as start_memory's: the
5284 register number, and the number of inner groups. */
5286 DEBUG_PRINT3("EXECUTING stop_memory %d (%d):\n", *p,
5289 /* We need to save the string position the last time we
5290 were at this close-group operator in case the group
5291 is operated upon by a repetition operator, e.g., with
5292 `((a*)*(b*)*)*' against `aba'; then we want to ignore
5293 where we are now in the string in case this attempt
5295 old_regend[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
5296 ? REG_UNSET(regend[*p]) ? d : regend[*p]
5298 DEBUG_PRINT2(" old_regend: %d\n",
5299 POINTER_TO_OFFSET(old_regend[*p]));
5302 DEBUG_PRINT2(" regend: %d\n",
5303 POINTER_TO_OFFSET(regend[*p]));
5305 /* This register isn't active anymore. */
5306 IS_ACTIVE(reg_info[*p]) = 0;
5308 /* Clear this whenever we change the register activity
5310 set_regs_matched_done = 0;
5312 /* If this was the only register active, nothing is
5314 if (lowest_active_reg == highest_active_reg) {
5315 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5316 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5317 } else { /* We must scan for the new highest
5318 active register, since it isn't
5319 necessarily one less than now:
5320 consider (a(b)c(d(e)f)g). When group
5321 3 ends, after the f), the new highest
5322 active register is 1. */
5323 unsigned char r = *p - 1;
5324 while (r > 0 && !IS_ACTIVE(reg_info[r]))
5327 /* If we end up at register zero, that means
5328 that we saved the registers as the result of
5329 an `on_failure_jump', not a `start_memory',
5330 and we jumped to past the innermost
5331 `stop_memory'. For example, in ((.)*) we
5332 save registers 1 and 2 as a result of the *,
5333 but when we pop back to the second ), we are
5334 at the stop_memory 1. Thus, nothing is
5338 NO_LOWEST_ACTIVE_REG;
5339 highest_active_reg =
5340 NO_HIGHEST_ACTIVE_REG;
5342 highest_active_reg = r;
5344 /* 98/9/21 jhod: We've also gotta set
5345 lowest_active_reg, don't we? */
5347 while (r < highest_active_reg
5348 && !IS_ACTIVE(reg_info[r]))
5350 lowest_active_reg = r;
5354 /* If just failed to match something this time around
5355 with a group that's operated on by a repetition
5356 operator, try to force exit from the ``loop'', and
5357 restore the register information for this group that
5358 we had before trying this last match. */
5359 if ((!MATCHED_SOMETHING(reg_info[*p])
5360 || just_past_start_mem == p - 1)
5361 && (p + 2) < pend) {
5362 re_bool is_a_jump_n = false;
5366 switch ((unsigned int)(re_opcode_t)*p1++) {
5369 case pop_failure_jump:
5370 case maybe_pop_jump:
5372 case dummy_failure_jump:
5373 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
5383 /* If the next operation is a jump backwards in
5384 the pattern to an on_failure_jump right
5385 before the start_memory corresponding to this
5386 stop_memory, exit from the loop by forcing a
5387 failure after pushing on the stack the
5388 on_failure_jump's jump in the pattern, and
5391 && (re_opcode_t) * p1 == on_failure_jump
5392 && (re_opcode_t) p1[3] == start_memory
5394 /* If this group ever matched anything,
5395 then restore what its registers were
5396 before trying this last failed match,
5397 e.g., with `(a*)*b' against `ab' for
5398 regstart[1], and, e.g., with
5399 `((a*)*(b*)*)*' against `aba' for
5402 Also restore the registers for inner
5403 groups for, e.g., `((a*)(b*))*'
5404 against `aba' (register 3 would
5405 otherwise get trashed). */
5407 if (EVER_MATCHED_SOMETHING
5411 EVER_MATCHED_SOMETHING(reg_info
5415 /* Restore this and inner
5418 for (r = *p; r < *p + *(p + 1);
5423 /* xx why this test? */
5424 if (old_regend[r] >=
5432 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
5433 PUSH_FAILURE_POINT(p1 + mcnt, d, -2);
5439 /* Move past the register number and the inner group
5444 /* \<digit> has been turned into a `duplicate' command
5445 which is followed by the numeric value of <digit> as
5446 the register number. (Already passed through
5447 external-to-internal-register mapping, so it refers
5448 to the actual group number, not the non-shy-only
5449 numbering used in the external world.) */
5452 REGISTER re_char *d2, *dend2;
5453 /* Get which register to match against. */
5455 DEBUG_PRINT2("EXECUTING duplicate %d.\n",
5458 /* Can't back reference a group which we've
5460 if (REG_UNSET(regstart[regno])
5461 || REG_UNSET(regend[regno]))
5464 /* Where in input to try to start matching. */
5465 d2 = regstart[regno];
5467 /* Where to stop matching; if both the place to
5468 start and the place to stop matching are in
5469 the same string, then set to the place to
5470 stop, otherwise, for now have to use the end
5471 of the first string. */
5473 dend2 = ((FIRST_STRING_P(regstart[regno])
5474 == FIRST_STRING_P(regend[regno]))
5475 ? regend[regno] : end_match_1);
5477 /* If necessary, advance to next segment
5478 in register contents. */
5479 while (d2 == dend2) {
5480 if (dend2 == end_match_2)
5482 if (dend2 == regend[regno])
5485 /* End of string1 => advance to
5488 dend2 = regend[regno];
5490 /* At end of register contents =>
5495 /* If necessary, advance to next segment
5499 /* How many characters left in this segment to match. */
5502 /* Want how many consecutive characters
5503 we can match in one shot, so, if
5504 necessary, adjust the count. */
5505 if (mcnt > dend2 - d2)
5508 /* Compare that many; failure if
5509 mismatch, else move past them. */
5510 if (TRANSLATE_P(translate)
5512 (const unsigned char*)d,
5513 (const unsigned char*)d2,
5515 : memcmp(d, d2, mcnt)) {
5518 d += mcnt, d2 += mcnt;
5520 /* Do this because we've match some
5527 /* begline matches the empty string at the beginning of
5528 the string (unless `not_bol' is set in `bufp'), and,
5529 if `newline_anchor' is set, after newlines. */
5531 DEBUG_PRINT1("EXECUTING begline.\n");
5533 if (AT_STRINGS_BEG(d)) {
5536 } else if (d[-1] == '\n' && bufp->newline_anchor) {
5539 /* In all other cases, we fail. */
5542 /* endline is the dual of begline. */
5544 DEBUG_PRINT1("EXECUTING endline.\n");
5546 if (AT_STRINGS_END(d)) {
5551 /* We have to ``prefetch'' the next character. */
5552 else if ((d == end1 ? *string2 : *d) == '\n'
5553 && bufp->newline_anchor) {
5558 /* Match at the very beginning of the data. */
5560 DEBUG_PRINT1("EXECUTING begbuf.\n");
5561 if (AT_STRINGS_BEG(d))
5565 /* Match at the very end of the data. */
5567 DEBUG_PRINT1("EXECUTING endbuf.\n");
5568 if (AT_STRINGS_END(d))
5572 /* on_failure_keep_string_jump is used to optimize
5573 `.*\n'. It pushes NULL as the value for the string
5574 on the stack. Then `pop_failure_point' will keep the
5575 current value for the string, instead of restoring
5576 it. To see why, consider matching `foo\nbar' against
5577 `.*\n'. The .* matches the foo; then the . fails
5578 against the \n. But the next thing we want to do is
5579 match the \n against the \n; if we restored the
5580 string value, we would be back at the foo.
5582 Because this is used only in specific cases, we don't
5583 need to check all the things that `on_failure_jump'
5584 does, to make sure the right things get saved on the
5585 stack. Hence we don't share its code. The only
5586 reason to push anything on the stack at all is that
5587 otherwise we would have to change `anychar's code to
5588 do something besides goto fail in this case; that
5589 seems worse than this. */
5590 case on_failure_keep_string_jump:
5591 DEBUG_PRINT1("EXECUTING on_failure_keep_string_jump");
5593 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5594 DEBUG_PRINT3(" %d (to 0x%lx):\n", mcnt,
5597 PUSH_FAILURE_POINT(p + mcnt, (unsigned char *)0, -2);
5600 /* Uses of on_failure_jump:
5602 Each alternative starts with an on_failure_jump that
5603 points to the beginning of the next alternative.
5604 Each alternative except the last ends with a jump
5605 that in effect jumps past the rest of the
5606 alternatives. (They really jump to the ending jump
5607 of the following alternative, because tensioning
5608 these jumps is a hassle.)
5610 Repeats start with an on_failure_jump that points
5611 past both the repetition text and either the
5612 following jump or pop_failure_jump back to this
5614 case on_failure_jump:
5616 DEBUG_PRINT1("EXECUTING on_failure_jump");
5618 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5619 DEBUG_PRINT3(" %d (to 0x%lx)", mcnt, (long)(p + mcnt));
5621 /* If this on_failure_jump comes right before a group
5622 (i.e., the original * applied to a group), save the
5623 information for that group and all inner ones, so
5624 that if we fail back to this point, the group's
5625 information will be correct. For example, in
5626 \(a*\)*\1, we need the preceding group, and in
5627 \(\(a*\)b*\)\2, we need the inner group. */
5629 /* We can't use `p' to check ahead because we push
5630 a failure point to `p + mcnt' after we do this. */
5633 /* We need to skip no_op's before we look for the
5634 start_memory in case this on_failure_jump is
5635 happening as the result of a completed succeed_n, as
5636 in \(a\)\{1,3\}b\1 against aba. */
5637 while (p1 < pend && (re_opcode_t) * p1 == no_op)
5640 if (p1 < pend && (re_opcode_t) * p1 == start_memory) {
5641 /* We have a new highest active register now.
5642 This will get reset at the start_memory we
5643 are about to get to, but we will have saved
5644 all the registers relevant to this repetition
5645 op, as described above. */
5646 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5647 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5648 lowest_active_reg = *(p1 + 1);
5651 DEBUG_PRINT1(":\n");
5652 PUSH_FAILURE_POINT(p + mcnt, d, -2);
5655 /* A smart repeat ends with `maybe_pop_jump'.
5656 We change it to either `pop_failure_jump' or `jump'. */
5657 case maybe_pop_jump:
5658 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5659 DEBUG_PRINT2("EXECUTING maybe_pop_jump %d.\n", mcnt);
5661 REGISTER unsigned char *p2 = p;
5663 /* Compare the beginning of the repeat with what
5664 in the pattern follows its end. If we can
5665 establish that there is nothing that they
5666 would both match, i.e., that we would have to
5667 backtrack because of (as in, e.g., `a*a')
5668 then we can change to pop_failure_jump,
5669 because we'll never have to backtrack.
5671 This is not true in the case of alternatives:
5672 in `(a|ab)*' we do need to backtrack to the
5673 `ab' alternative (e.g., if the string was
5674 `ab'). But instead of trying to detect that
5675 here, the alternative has put on a dummy
5676 failure point which is what we will end up
5679 /* Skip over open/close-group commands. If what
5680 follows this loop is a ...+ construct, look
5681 at what begins its body, since we will have
5682 to match at least one of that. */
5685 && ((re_opcode_t) * p2 ==
5687 || (re_opcode_t) * p2 ==
5690 else if (p2 + 6 < pend
5691 && (re_opcode_t) * p2 ==
5699 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5700 to the `maybe_finalize_jump' of this case. Examine what
5703 /* If we're at the end of the pattern, we can change. */
5705 /* Consider what happens when matching ":\(.*\)"
5706 against ":/". I don't really understand this code
5708 p[-3] = (unsigned char)pop_failure_jump;
5710 (" End of pattern: change to `pop_failure_jump'.\n");
5713 else if ((re_opcode_t) * p2 == exactn
5714 || (bufp->newline_anchor
5715 && (re_opcode_t) * p2 ==
5717 REGISTER unsigned char c =
5719 (unsigned char)endline ? '\n' :
5722 if ((re_opcode_t) p1[3] == exactn
5728 (" %c != %c => pop_failure_jump.\n",
5732 else if ((re_opcode_t) p1[3] == charset
5733 || (re_opcode_t) p1[3] ==
5736 (re_opcode_t) p1[3] ==
5740 (unsigned char)(p1[4] *
5744 BYTEWIDTH] & (1 << (c
5749 /* `not_p' is equal to 1 if c would match, which means
5750 that we can't change to pop_failure_jump. */
5756 (" No match => pop_failure_jump.\n");
5759 } else if ((re_opcode_t) * p2 == charset) {
5760 #ifdef REGEX_DEBUG_FLAG
5761 REGISTER unsigned char c
5764 (unsigned char)endline ? '\n' :
5768 if ((re_opcode_t) p1[3] == exactn
5769 && !((int)p2[1] * BYTEWIDTH >
5771 && (p2[2 + p1[5] / BYTEWIDTH]
5779 (" %c != %c => pop_failure_jump.\n",
5783 else if ((re_opcode_t) p1[3] ==
5786 /* We win if the charset_not inside the loop
5787 lists every character listed in the charset after. */
5788 for (idx = 0; idx < (int)p2[1];
5806 (" No match => pop_failure_jump.\n");
5808 } else if ((re_opcode_t) p1[3] ==
5811 /* We win if the charset inside the loop
5812 has no overlap with the one after the loop. */
5815 && idx < (int)p1[4]; idx++)
5826 (" No match => pop_failure_jump.\n");
5831 p -= 2; /* Point at relative address again. */
5832 if ((re_opcode_t) p[-1] != pop_failure_jump) {
5833 p[-1] = (unsigned char)jump;
5834 DEBUG_PRINT1(" Match => jump.\n");
5835 goto unconditional_jump;
5837 /* Note fall through. */
5839 /* The end of a simple repeat has a pop_failure_jump
5840 back to its matching on_failure_jump, where the
5841 latter will push a failure point. The
5842 pop_failure_jump takes off failure points put on by
5843 this pop_failure_jump's matching on_failure_jump; we
5844 got through the pattern to here from the matching
5845 on_failure_jump, so didn't fail. */
5846 case pop_failure_jump: {
5847 /* We need to pass separate storage for the
5848 lowest and highest registers, even though we
5849 don't care about the actual values.
5850 Otherwise, we will restore only one register
5851 from the stack, since lowest will == highest
5852 in `pop_failure_point'. */
5853 int dummy_low_reg, dummy_high_reg;
5855 re_char *sdummy = NULL;
5857 DEBUG_PRINT1("EXECUTING pop_failure_jump.\n");
5858 POP_FAILURE_POINT(sdummy, pdummy,
5859 dummy_low_reg, dummy_high_reg,
5860 reg_dummy, reg_dummy,
5863 /* Note fall through. */
5865 /* Unconditionally jump (without popping any failure
5869 /* Get the amount to jump. */
5870 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5871 DEBUG_PRINT2("EXECUTING jump %d ", mcnt);
5872 p += mcnt; /* Do the jump. */
5873 DEBUG_PRINT2("(to 0x%lx).\n", (long)p);
5876 /* We need this opcode so we can detect where alternatives end
5877 in `group_match_null_string_p' et al. */
5879 DEBUG_PRINT1("EXECUTING jump_past_alt.\n");
5880 goto unconditional_jump;
5882 /* Normally, the on_failure_jump pushes a failure point, which
5883 then gets popped at pop_failure_jump. We will end up at
5884 pop_failure_jump, also, and with a pattern of, say, `a+', we
5885 are skipping over the on_failure_jump, so we have to push
5886 something meaningless for pop_failure_jump to pop. */
5887 case dummy_failure_jump:
5888 DEBUG_PRINT1("EXECUTING dummy_failure_jump.\n");
5889 /* It doesn't matter what we push for the string here. What
5890 the code at `fail' tests is the value for the pattern. */
5891 PUSH_FAILURE_POINT(NULL, NULL, -2);
5892 goto unconditional_jump;
5894 /* At the end of an alternative, we need to push a dummy failure
5895 point in case we are followed by a `pop_failure_jump', because
5896 we don't want the failure point for the alternative to be
5897 popped. For example, matching `(a|ab)*' against `aab'
5898 requires that we match the `ab' alternative. */
5899 case push_dummy_failure:
5900 DEBUG_PRINT1("EXECUTING push_dummy_failure.\n");
5901 /* See comments just above at `dummy_failure_jump' about the
5903 PUSH_FAILURE_POINT(NULL, NULL, -2);
5906 /* Have to succeed matching what follows at least n times.
5907 After that, handle like `on_failure_jump'. */
5909 EXTRACT_NUMBER(mcnt, p + 2);
5910 DEBUG_PRINT2("EXECUTING succeed_n %d.\n", mcnt);
5913 /* Originally, this is how many times we HAVE to succeed. */
5917 STORE_NUMBER_AND_INCR(p, mcnt);
5918 DEBUG_PRINT3(" Setting 0x%lx to %d.\n",
5920 } else if (mcnt == 0) {
5922 (" Setting two bytes from 0x%lx to no_op.\n",
5924 p[2] = (unsigned char)no_op;
5925 p[3] = (unsigned char)no_op;
5931 EXTRACT_NUMBER(mcnt, p + 2);
5932 DEBUG_PRINT2("EXECUTING jump_n %d.\n", mcnt);
5934 /* Originally, this is how many times we CAN jump. */
5937 STORE_NUMBER(p + 2, mcnt);
5938 goto unconditional_jump;
5940 /* If don't have to jump any more, skip over the rest of command. */
5947 DEBUG_PRINT1("EXECUTING set_number_at.\n");
5949 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5951 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5952 DEBUG_PRINT3(" Setting 0x%lx to %d.\n",
5954 STORE_NUMBER(p1, mcnt);
5959 DEBUG_PRINT1("EXECUTING wordbound.\n");
5964 /* Straightforward and (I hope) correct implementation.
5965 Probably should be optimized by arranging to compute
5967 /* emch1 is the character before d, syn1 is the syntax of
5968 emch1, emch2 is the character at d, and syn2 is the
5970 Emchar emch1, emch2;
5971 /* GCC isn't smart enough to see these are initialized if used. */
5972 int syn1 = 0, syn2 = 0;
5973 re_char *d_before, *d_after;
5975 at_beg = AT_STRINGS_BEG(d),
5976 at_end = AT_STRINGS_END(d);
5981 if (at_beg && at_end) {
5986 POS_BEFORE_GAP_UNSAFE(d);
5987 DEC_CHARPTR(d_before);
5989 charptr_emchar(d_before);
5992 SYNTAX_CACHE_BYTE_TO_CHAR
5993 (PTR_TO_OFFSET(d)) - 1;
5994 UPDATE_SYNTAX_CACHE(xpos);
5996 syn1 = SYNTAX_FROM_CACHE
5998 (regex_emacs_buffer->
5999 mirror_syntax_table),
6004 POS_AFTER_GAP_UNSAFE(d);
6005 emch2 = charptr_emchar(d_after);
6008 SYNTAX_CACHE_BYTE_TO_CHAR
6010 UPDATE_SYNTAX_CACHE_FORWARD(xpos
6014 syn2 = SYNTAX_FROM_CACHE
6016 (regex_emacs_buffer->
6017 mirror_syntax_table),
6022 result = (syn2 == Sword);
6024 result = (syn1 == Sword);
6031 if (result == should_succeed)
6037 DEBUG_PRINT1("EXECUTING notwordbound.\n");
6039 goto matchwordbound;
6042 DEBUG_PRINT1("EXECUTING wordbeg.\n");
6043 if (AT_STRINGS_END(d))
6046 /* XEmacs: this originally read:
6048 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
6052 re_char *dtmp = POS_AFTER_GAP_UNSAFE(d);
6053 Emchar emch = charptr_emchar(dtmp);
6056 SYNTAX_CACHE_BYTE_TO_CHAR(PTR_TO_OFFSET(d));
6057 UPDATE_SYNTAX_CACHE(charpos);
6059 if (SYNTAX_FROM_CACHE
6061 (regex_emacs_buffer->mirror_syntax_table),
6064 if (AT_STRINGS_BEG(d))
6066 dtmp = POS_BEFORE_GAP_UNSAFE(d);
6068 emch = charptr_emchar(dtmp);
6070 UPDATE_SYNTAX_CACHE_BACKWARD(charpos - 1);
6072 if (SYNTAX_FROM_CACHE
6074 (regex_emacs_buffer->mirror_syntax_table),
6081 DEBUG_PRINT1("EXECUTING wordend.\n");
6082 if (AT_STRINGS_BEG(d))
6085 /* XEmacs: this originally read:
6087 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
6088 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
6091 The or condition is incorrect (reversed).
6097 SYNTAX_CACHE_BYTE_TO_CHAR(PTR_TO_OFFSET(d))
6099 UPDATE_SYNTAX_CACHE(charpos);
6101 dtmp = POS_BEFORE_GAP_UNSAFE(d);
6103 emch = charptr_emchar(dtmp);
6104 if (SYNTAX_FROM_CACHE
6106 (regex_emacs_buffer->mirror_syntax_table),
6109 if (AT_STRINGS_END(d))
6111 dtmp = POS_AFTER_GAP_UNSAFE(d);
6112 emch = charptr_emchar(dtmp);
6114 UPDATE_SYNTAX_CACHE_FORWARD(charpos + 1);
6116 if (SYNTAX_FROM_CACHE
6118 (regex_emacs_buffer->mirror_syntax_table),
6126 DEBUG_PRINT1("EXECUTING before_dot.\n");
6128 (NILP(regex_match_object)
6129 || BUFFERP(regex_match_object))
6130 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6131 >= BUF_PT(regex_emacs_buffer)))
6136 DEBUG_PRINT1("EXECUTING at_dot.\n");
6138 (NILP(regex_match_object)
6139 || BUFFERP(regex_match_object))
6140 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6141 != BUF_PT(regex_emacs_buffer)))
6146 DEBUG_PRINT1("EXECUTING after_dot.\n");
6148 (NILP(regex_match_object)
6149 || BUFFERP(regex_match_object))
6150 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6151 <= BUF_PT(regex_emacs_buffer)))
6154 #if 0 /* not emacs19 */
6156 DEBUG_PRINT1("EXECUTING at_dot.\n");
6157 if (BUF_PTR_BYTE_POS
6158 (regex_emacs_buffer,
6159 (unsigned char *)d) + 1 !=
6160 BUF_PT(regex_emacs_buffer))
6163 #endif /* not emacs19 */
6166 DEBUG_PRINT2("EXECUTING syntaxspec %d.\n", mcnt);
6171 DEBUG_PRINT1("EXECUTING Emacs wordchar.\n");
6184 SYNTAX_CACHE_BYTE_TO_CHAR
6186 UPDATE_SYNTAX_CACHE(charpos);
6190 emch = charptr_emchar((const Bufbyte *)d);
6194 (regex_emacs_buffer->mirror_syntax_table),
6195 emch) == (enum syntaxcode)mcnt);
6197 if (matches != should_succeed)
6204 DEBUG_PRINT2("EXECUTING notsyntaxspec %d.\n", mcnt);
6206 goto matchnotsyntax;
6209 DEBUG_PRINT1("EXECUTING Emacs notwordchar.\n");
6213 goto matchornotsyntax;
6216 /* 97/2/17 jhod Mule category code patch */
6225 emch = charptr_emchar((const Bufbyte *)d);
6227 if (check_category_char
6228 (emch, regex_emacs_buffer->category_table,
6229 mcnt, should_succeed))
6235 case notcategoryspec:
6237 goto matchornotcategory;
6238 /* end of category patch */
6240 #else /* not emacs */
6242 DEBUG_PRINT1("EXECUTING non-Emacs wordchar.\n");
6244 if (!WORDCHAR_P_UNSAFE((int)(*d)))
6251 DEBUG_PRINT1("EXECUTING non-Emacs notwordchar.\n");
6253 if (!WORDCHAR_P_UNSAFE((int)(*d)))
6263 /* Successfully executed one pattern command; keep going. */
6266 /* We goto here if a matching operation fails. */
6268 if (!FAIL_STACK_EMPTY()) {
6269 /* A restart point is known. Restore to that state. */
6270 DEBUG_PRINT1("\nFAIL:\n");
6271 POP_FAILURE_POINT(d, p /* used to be const char* */,
6272 lowest_active_reg, highest_active_reg,
6273 regstart, regend, reg_info);
6275 /* If this failure point is a dummy, try the next one. */
6279 /* If we failed to the end of the pattern, don't examine
6283 re_bool is_a_jump_n = false;
6285 /* If failed to a backwards jump that's part of
6286 a repetition loop, need to pop this failure
6287 point and use the next one. */
6288 switch ((unsigned int)(re_opcode_t)*p) {
6291 case maybe_pop_jump:
6292 case pop_failure_jump:
6295 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6299 && (re_opcode_t) * p1 == succeed_n)
6301 && (re_opcode_t) * p1 ==
6310 if (d >= string1 && d <= end1)
6313 break; /* Matching at this starting point really fails. */
6317 goto restore_best_regs;
6321 return -1; /* Failure to match. */
6324 /* Subroutine definitions for re_match_2. */
6326 /* We are passed P pointing to a register number after a start_memory.
6328 Return true if the pattern up to the corresponding stop_memory can
6329 match the empty string, and false otherwise.
6331 If we find the matching stop_memory, sets P to point to one past its number.
6332 Otherwise, sets P to an undefined byte less than or equal to END.
6334 We don't handle duplicates properly (yet). */
6337 group_match_null_string_p(unsigned char **p, unsigned char *end,
6338 register_info_type * register_info)
6341 /* Point to after the args to the start_memory. */
6342 unsigned char *p1 = *p + 2;
6345 /* Skip over opcodes that can match nothing, and return true or
6346 false, as appropriate, when we get to one that can't, or to the
6347 matching stop_memory. */
6349 switch ((unsigned int)(re_opcode_t)*p1) {
6350 /* Could be either a loop or a series of alternatives. */
6351 case on_failure_jump:
6353 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6355 /* If the next operation is not a jump backwards in the
6359 /* Go through the on_failure_jumps of the alternatives,
6360 seeing if any of the alternatives cannot match nothing.
6361 The last alternative starts with only a jump,
6362 whereas the rest start with on_failure_jump and end
6363 with a jump, e.g., here is the pattern for `a|b|c':
6365 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
6366 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
6369 So, we have to first go through the first (n-1)
6370 alternatives and then deal with the last one separately. */
6372 /* Deal with the first (n-1) alternatives, which start
6373 with an on_failure_jump (see above) that jumps to right
6374 past a jump_past_alt. */
6376 while ((re_opcode_t) p1[mcnt - 3] ==
6378 /* `mcnt' holds how many bytes long the alternative
6379 is, including the ending `jump_past_alt' and
6382 if (!alt_match_null_string_p
6383 (p1, p1 + mcnt - 3, register_info))
6386 /* Move to right after this alternative, including the
6390 /* Break if it's the beginning of an n-th alternative
6391 that doesn't begin with an on_failure_jump. */
6392 if ((re_opcode_t) * p1 !=
6396 /* Still have to check that it's not an n-th
6397 alternative that starts with an on_failure_jump. */
6399 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6400 if ((re_opcode_t) p1[mcnt - 3] !=
6402 /* Get to the beginning of the n-th alternative. */
6408 /* Deal with the last alternative: go back and get number
6409 of the `jump_past_alt' just before it. `mcnt' contains
6410 the length of the alternative. */
6411 EXTRACT_NUMBER(mcnt, p1 - 2);
6413 if (!alt_match_null_string_p
6414 (p1, p1 + mcnt, register_info))
6417 p1 += mcnt; /* Get past the n-th alternative. */
6422 assert(p1[1] == **p);
6427 if (!common_op_match_null_string_p(&p1, end, register_info))
6430 } /* while p1 < end */
6433 } /* group_match_null_string_p */
6435 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6436 It expects P to be the first byte of a single alternative and END one
6437 byte past the last. The alternative can contain groups. */
6440 alt_match_null_string_p(unsigned char *p, unsigned char *end,
6441 register_info_type * register_info)
6444 unsigned char *p1 = p;
6447 /* Skip over opcodes that can match nothing, and break when we get
6448 to one that can't. */
6450 switch ((unsigned int)(re_opcode_t)*p1) {
6452 case on_failure_jump:
6454 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6459 if (!common_op_match_null_string_p(&p1, end, register_info))
6462 } /* while p1 < end */
6465 } /* alt_match_null_string_p */
6467 /* Deals with the ops common to group_match_null_string_p and
6468 alt_match_null_string_p.
6470 Sets P to one after the op and its arguments, if any. */
6473 common_op_match_null_string_p(unsigned char **p, unsigned char *end,
6474 register_info_type * register_info)
6479 unsigned char *p1 = *p;
6481 switch ((unsigned int)(re_opcode_t)*p1++) {
6500 assert(reg_no > 0 && reg_no <= MAX_REGNUM);
6501 ret = group_match_null_string_p(&p1, end, register_info);
6503 /* Have to set this here in case we're checking a group which
6504 contains a group and a back reference to it. */
6506 if (REG_MATCH_NULL_STRING_P(register_info[reg_no]) ==
6507 MATCH_NULL_UNSET_VALUE)
6508 REG_MATCH_NULL_STRING_P(register_info[reg_no]) = ret;
6514 /* If this is an optimized succeed_n for zero times, make the jump. */
6516 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6524 /* Get to the number of times to succeed. */
6526 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6530 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6537 if (!REG_MATCH_NULL_STRING_P(register_info[*p1]))
6545 /* All other opcodes mean we cannot match the empty string. */
6551 } /* common_op_match_null_string_p */
6553 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6554 bytes; nonzero otherwise. */
6557 bcmp_translate(re_char *s1, re_char *s2,
6558 REGISTER int len, RE_TRANSLATE_TYPE translate)
6560 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6562 const unsigned char *p1_end = s1 + len;
6563 const unsigned char *p2_end = s2 + len;
6565 while (p1 != p1_end && p2 != p2_end) {
6566 Emchar p1_ch, p2_ch;
6568 p1_ch = charptr_emchar(p1);
6569 p2_ch = charptr_emchar(p2);
6571 if (RE_TRANSLATE(p1_ch)
6572 != RE_TRANSLATE(p2_ch))
6577 #else /* not MULE */
6579 if (RE_TRANSLATE(*p1++) != RE_TRANSLATE(*p2++))
6587 /* Entry points for GNU code. */
6589 /* re_compile_pattern is the GNU regular expression compiler: it
6590 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6591 Returns 0 if the pattern was valid, otherwise an error string.
6593 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6594 are set in BUFP on entry.
6596 We call regex_compile to do the actual compilation. */
6598 const char *re_compile_pattern(const char *pattern, int length,
6599 struct re_pattern_buffer *bufp)
6603 /* GNU code is written to assume at least RE_NREGS registers will be set
6604 (and at least one extra will be -1). */
6605 bufp->regs_allocated = REGS_UNALLOCATED;
6607 /* And GNU code determines whether or not to get register information
6608 by passing null for the REGS argument to re_match, etc., not by
6612 /* Match anchors at newline. */
6613 bufp->newline_anchor = 1;
6615 ret = regex_compile((const unsigned char*)pattern,
6616 length, re_syntax_options, bufp);
6620 return gettext(re_error_msgid[(int)ret]);
6623 /* Entry points compatible with 4.2 BSD regex library. We don't define
6624 them unless specifically requested. */
6626 #ifdef _REGEX_RE_COMP
6628 /* BSD has one and only one pattern buffer. */
6629 static struct re_pattern_buffer re_comp_buf;
6631 char *re_comp(const char *s)
6636 if (!re_comp_buf.buffer)
6637 return gettext("No previous regular expression");
6641 if (!re_comp_buf.buffer) {
6642 re_comp_buf.buffer = (unsigned char *)xmalloc_atomic(200);
6643 if (re_comp_buf.buffer == NULL)
6644 return gettext(re_error_msgid[(int)REG_ESPACE]);
6645 re_comp_buf.allocated = 200;
6647 re_comp_buf.fastmap = (char *)xmalloc_atomic(1 << BYTEWIDTH);
6648 if (re_comp_buf.fastmap == NULL)
6649 return gettext(re_error_msgid[(int)REG_ESPACE]);
6652 /* Since `re_exec' always passes NULL for the `regs' argument, we
6653 don't need to initialize the pattern buffer fields which affect it. */
6655 /* Match anchors at newlines. */
6656 re_comp_buf.newline_anchor = 1;
6659 regex_compile((unsigned char *)s, strlen(s), re_syntax_options,
6665 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6666 return (char *)gettext(re_error_msgid[(int)ret]);
6669 int re_exec(const char *s)
6671 const int len = strlen(s);
6673 0 <= re_search(&re_comp_buf, s, len, 0, len,
6674 (struct re_registers *)0);
6676 #endif /* _REGEX_RE_COMP */
6678 /* POSIX.2 functions. Don't define these for Emacs. */
6682 /* regcomp takes a regular expression as a string and compiles it.
6684 PREG is a regex_t *. We do not expect any fields to be initialized,
6685 since POSIX says we shouldn't. Thus, we set
6687 `buffer' to the compiled pattern;
6688 `used' to the length of the compiled pattern;
6689 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6690 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6691 RE_SYNTAX_POSIX_BASIC;
6692 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6693 `fastmap' and `fastmap_accurate' to zero;
6694 `re_nsub' to the number of subexpressions in PATTERN.
6695 (non-shy of course. POSIX probably doesn't know about
6696 shy ones, and in any case they should be invisible.)
6698 PATTERN is the address of the pattern string.
6700 CFLAGS is a series of bits which affect compilation.
6702 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6703 use POSIX basic syntax.
6705 If REG_NEWLINE is set, then . and [^...] don't match newline.
6706 Also, regexec will try a match beginning after every newline.
6708 If REG_ICASE is set, then we considers upper- and lowercase
6709 versions of letters to be equivalent when matching.
6711 If REG_NOSUB is set, then when PREG is passed to regexec, that
6712 routine will report only success or failure, and nothing about the
6715 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6716 the return codes and their meanings.) */
6718 int regcomp(regex_t * preg, const char *pattern, int cflags)
6722 = (cflags & REG_EXTENDED) ?
6723 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6725 /* regex_compile will allocate the space for the compiled pattern. */
6727 preg->allocated = 0;
6730 /* Don't bother to use a fastmap when searching. This simplifies the
6731 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6732 characters after newlines into the fastmap. This way, we just try
6736 if (cflags & REG_ICASE) {
6739 preg->translate = (char *)xmalloc_atomic(CHAR_SET_SIZE);
6740 if (preg->translate == NULL)
6741 return (int)REG_ESPACE;
6743 /* Map uppercase characters to corresponding lowercase ones. */
6744 for (i = 0; i < CHAR_SET_SIZE; i++)
6745 preg->translate[i] = ISUPPER(i) ? tolower(i) : i;
6747 preg->translate = NULL;
6749 /* If REG_NEWLINE is set, newlines are treated differently. */
6750 if (cflags & REG_NEWLINE) {
6751 /* REG_NEWLINE implies neither . nor [^...] match newline. */
6752 syntax &= ~RE_DOT_NEWLINE;
6753 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6754 /* It also changes the matching behavior. */
6755 preg->newline_anchor = 1;
6757 preg->newline_anchor = 0;
6759 preg->no_sub = !!(cflags & REG_NOSUB);
6761 /* POSIX says a null character in the pattern terminates it, so we
6762 can use strlen here in compiling the pattern. */
6763 ret = regex_compile((const unsigned char*)pattern,
6764 strlen(pattern), syntax, preg);
6766 /* POSIX doesn't distinguish between an unmatched open-group and an
6767 unmatched close-group: both are REG_EPAREN. */
6768 if (ret == REG_ERPAREN)
6774 /* regexec searches for a given pattern, specified by PREG, in the
6777 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6778 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6779 least NMATCH elements, and we set them to the offsets of the
6780 corresponding matched substrings.
6782 EFLAGS specifies `execution flags' which affect matching: if
6783 REG_NOTBOL is set, then ^ does not match at the beginning of the
6784 string; if REG_NOTEOL is set, then $ does not match at the end.
6786 We return 0 if we find a match and REG_NOMATCH if not. */
6789 regexec(const regex_t * preg, const char *string, Element_count nmatch,
6790 regmatch_t pmatch[], int eflags)
6793 struct re_registers regs;
6794 regex_t private_preg;
6795 int len = strlen(string);
6796 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6798 private_preg = *preg;
6800 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6801 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6803 /* The user has told us exactly how many registers to return
6804 information about, via `nmatch'. We have to pass that on to the
6805 matching routines. */
6806 private_preg.regs_allocated = REGS_FIXED;
6808 if (want_reg_info) {
6809 regs.num_regs = nmatch;
6810 regs.start = TALLOC(nmatch, regoff_t);
6811 regs.end = TALLOC(nmatch, regoff_t);
6812 if (regs.start == NULL || regs.end == NULL)
6813 return (int)REG_NOMATCH;
6816 /* Perform the searching operation. */
6817 ret = re_search(&private_preg, string, len,
6818 /* start: */ 0, /* range: */ len,
6819 want_reg_info ? ®s : (struct re_registers *)0);
6821 /* Copy the register information to the POSIX structure. */
6822 if (want_reg_info) {
6826 for (r = 0; r < nmatch; r++) {
6827 pmatch[r].rm_so = regs.start[r];
6828 pmatch[r].rm_eo = regs.end[r];
6832 /* If we needed the temporary register info, free the space now. */
6837 /* We want zero return to mean success, unlike `re_search'. */
6838 return ret >= 0 ? (int)REG_NOERROR : (int)REG_NOMATCH;
6841 /* Returns a message corresponding to an error code, ERRCODE, returned
6842 from either regcomp or regexec. We don't use PREG here. */
6845 regerror(int errcode, const regex_t * preg, char *errbuf,
6846 Memory_count errbuf_size)
6849 Memory_count msg_size;
6851 if (errcode < 0 || (size_t) errcode >= (sizeof(re_error_msgid)
6852 / sizeof(re_error_msgid[0])))
6853 /* Only error codes returned by the rest of the code should be passed
6854 to this routine. If we are given anything else, or if other regex
6855 code generates an invalid error code, then the program has a bug.
6856 Dump core so we can fix it. */
6859 msg = gettext(re_error_msgid[errcode]);
6861 msg_size = strlen(msg) + 1; /* Includes the null. */
6863 if (errbuf_size != 0) {
6864 strncpy(errbuf, msg, errbuf_size - 1);
6865 errbuf[errbuf_size-1]='\0';
6871 /* Free dynamically allocated space used by PREG. */
6873 void regfree(regex_t * preg)
6875 if (preg->buffer != NULL) {
6876 xfree(preg->buffer);
6878 preg->buffer = NULL;
6880 preg->allocated = 0;
6883 if (preg->fastmap != NULL) {
6884 xfree(preg->fastmap);
6886 preg->fastmap = NULL;
6887 preg->fastmap_accurate = 0;
6889 if (preg->translate != NULL) {
6890 xfree(preg->translate);
6892 preg->translate = NULL;
6895 #endif /* not emacs */
6899 make-backup-files: t
6901 trim-versions-without-asking: nil