1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
10 This file is part of SXEmacs
12 SXEmacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 SXEmacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(HAVE_GLIBC)
86 #include "sxe-utils.h"
88 /* The `emacs' switch turns on certain matching commands
89 that make sense only in Emacs. */
98 Lisp_Object Vthe_lisp_rangetab;
100 void complex_vars_of_regex(void)
102 Vthe_lisp_rangetab = Fmake_range_table();
103 staticpro(&Vthe_lisp_rangetab);
108 void complex_vars_of_regex(void)
114 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
115 #define TRANSLATE_P(tr) (!NILP (tr) && CHAR_TABLEP(tr))
117 /* just massage that xfree thing */
119 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
127 /* If we are not linking with Emacs proper,
128 we can't use the relocating allocator
129 even if config.h says that we can. */
132 #if defined (STDC_HEADERS) || defined (_LIBC)
134 # include <stdbool.h>
140 /* Types normally included via lisp.h */
141 #include <stddef.h> /* for ptrdiff_t */
144 #ifndef DECLARE_NOTHING
145 #define DECLARE_NOTHING struct nosuchstruct
151 #define charptr_emchar(str) ((Emchar) (str)[0])
153 #define INC_CHARPTR(p) ((p)++)
154 #define DEC_CHARPTR(p) ((p)--)
158 /* Define the syntax stuff for \<, \>, etc. */
160 /* This must be nonzero for the wordchar and notwordchar pattern
161 commands in re_match_2. */
168 extern char *re_syntax_table;
170 #else /* not SYNTAX_TABLE */
172 /* How many characters in the character set. */
173 #define CHAR_SET_SIZE 256
175 static char re_syntax_table[CHAR_SET_SIZE];
177 static void init_syntax_once(void)
182 const char *word_syntax_chars =
183 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
185 memset(re_syntax_table, 0, sizeof(re_syntax_table));
187 while (*word_syntax_chars)
188 re_syntax_table[(unsigned int)(*word_syntax_chars++)] =
195 #endif /* SYNTAX_TABLE */
197 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
198 #undef SYNTAX_FROM_CACHE
199 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
201 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
202 #define TRANSLATE_P(tr) tr
204 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
205 # if defined HAVE_GC_GC_H
207 # elif defined HAVE_GC_H
210 # define xmalloc GC_MALLOC
211 # define xmalloc_atomic GC_MALLOC_ATOMIC
212 # define xrealloc GC_REALLOC
213 # define xfree GC_FREE
214 # define xstrdup GC_STRDUP
215 #else /* !HAVE_BDWGC */
216 # define xmalloc malloc
217 # define xmalloc_atomic malloc
218 # define xrealloc realloc
220 # define xstrdup strdup
221 #endif /* HAVE_BDWGC */
226 /* Under XEmacs, this is needed because we don't define it elsewhere. */
227 #ifdef SWITCH_ENUM_BUG
228 #define SWITCH_ENUM_CAST(x) ((int)(x))
230 #define SWITCH_ENUM_CAST(x) (x)
233 /* Get the interface, including the syntax bits. */
236 /* isalpha etc. are used for the character classes. */
239 /* Jim Meyering writes:
241 "... Some ctype macros are valid only for character codes that
242 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
243 using /bin/cc or gcc but without giving an ansi option). So, all
244 ctype uses should be through macros like ISPRINT... If
245 STDC_HEADERS is defined, then autoconf has verified that the ctype
246 macros don't need to be guarded with references to isascii. ...
247 Defining isascii to 1 should let any compiler worth its salt
248 eliminate the && through constant folding." */
250 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
251 #define ISASCII_1(c) 1
253 #define ISASCII_1(c) isascii(c)
257 /* The IS*() macros can be passed any character, including an extended
258 one. We need to make sure there are no crashes, which would occur
259 otherwise due to out-of-bounds array references. */
260 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
262 #define ISASCII(c) ISASCII_1 (c)
266 #define ISBLANK(c) (ISASCII (c) && isblank (c))
268 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
271 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
273 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
276 /* stolen from FSF/src/charset.h */
277 #define SINGLE_BYTE_CHAR_P(c) (((unsigned int)(c) & 0xFF) == (c))
278 /* 1 if C is an ASCII character. */
279 #define IS_REAL_ASCII(c) ((c) < 128)
280 /* 1 if C is a unibyte character. */
281 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P(c))
283 #define ISPRINT(c) (ISASCII (c) && isprint (c))
284 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
285 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
286 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
287 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
288 #define ISLOWER(c) (ISASCII (c) && islower (c))
289 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
290 #define ISSPACE(c) (ISASCII (c) && isspace (c))
291 #define ISUPPER(c) (ISASCII (c) && isupper (c))
292 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
297 __attribute__((always_inline));
301 Lisp_Object ct = regex_emacs_buffer->mirror_syntax_table;
303 return (SYNTAX(XCHAR_TABLE(ct), c) == Sword);
306 # define ISWORD(c) (ISALPHA(c))
310 #define NULL (void *)0
313 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
314 since ours (we hope) works properly with all combinations of
315 machines, compilers, `char' and `unsigned char' argument types.
316 (Per Bothner suggested the basic approach.) */
317 #undef SIGN_EXTEND_CHAR
319 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
320 #else /* not __STDC__ */
321 /* As in Harbison and Steele. */
322 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
325 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
326 use `alloca' instead of `malloc'. This is because using malloc in
327 re_search* or re_match* could cause memory leaks when C-g is used in
328 Emacs; also, malloc is slower and causes storage fragmentation. On
329 the other hand, malloc is more portable, and easier to debug.
331 Because we sometimes use alloca, some routines have to be macros,
332 not functions -- `alloca'-allocated space disappears at the end of the
333 function it is called in. */
337 #define REGEX_ALLOCATE xmalloc
338 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc(source, nsize)
339 #define REGEX_FREE xfree
341 #else /* !REGEX_MALLOC */
343 /* Emacs already defines alloca, sometimes. */
346 /* Make alloca work the best possible way. */
348 #define alloca __builtin_alloca
349 #else /* not __GNUC__ */
352 #else /* not __GNUC__ or HAVE_ALLOCA_H */
353 #ifndef _AIX /* Already did AIX, up at the top. */
355 #endif /* not _AIX */
356 #endif /* HAVE_ALLOCA_H */
357 #endif /* __GNUC__ */
359 #endif /* not alloca */
361 #define REGEX_ALLOCATE alloca
363 /* Assumes a `char *destination' variable. */
364 #define REGEX_REALLOCATE(source, osize, nsize) \
365 (destination = (char *) alloca (nsize), \
366 memmove (destination, source, osize), \
369 /* No need to do anything to free, after alloca. */
370 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
372 #endif /* REGEX_MALLOC */
374 /* Define how to allocate the failure stack. */
378 #define REGEX_ALLOCATE_STACK(size) \
379 r_alloc ((char **) &failure_stack_ptr, (size))
380 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
381 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
382 #define REGEX_FREE_STACK(ptr) \
383 r_alloc_free ((void **) &failure_stack_ptr)
385 #else /* !REL_ALLOC */
389 #define REGEX_ALLOCATE_STACK xmalloc
390 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc(source, nsize)
391 #define REGEX_FREE_STACK xfree
393 #else /* !REGEX_MALLOC */
395 #define REGEX_ALLOCATE_STACK alloca
397 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
398 REGEX_REALLOCATE (source, osize, nsize)
399 /* No need to explicitly free anything. */
400 #define REGEX_FREE_STACK(arg)
402 #endif /* REGEX_MALLOC */
403 #endif /* REL_ALLOC */
405 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
406 `string1' or just past its end. This works if PTR is NULL, which is
408 #define FIRST_STRING_P(ptr) \
409 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
411 /* (Re)Allocate N items of type T using malloc, or fail. */
412 #define TALLOC(n, t) \
413 ((t *)xmalloc((n) * sizeof (t)))
414 #define TALLOC_ATOMIC(n, t) \
415 ((t *)xmalloc_atomic((n) * sizeof (t)))
416 #define RETALLOC(addr, n, t) \
417 ((addr) = (t *)xrealloc(addr, (n) * sizeof (t)))
418 #define RETALLOC_IF(addr, n, t) \
420 RETALLOC((addr), (n), t); \
422 (addr) = TALLOC ((n), t)
423 #define REGEX_TALLOC(n, t) \
424 ((t *)REGEX_ALLOCATE((n) * sizeof (t)))
426 #define BYTEWIDTH 8 /* In bits. */
428 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
432 #define MAX(a, b) ((a) > (b) ? (a) : (b))
433 #define MIN(a, b) ((a) < (b) ? (a) : (b))
435 /* Type of source-pattern and string chars. */
436 typedef const unsigned char re_char;
438 typedef char re_bool;
442 /* These are the command codes that appear in compiled regular
443 expressions. Some opcodes are followed by argument bytes. A
444 command code can specify any interpretation whatsoever for its
445 arguments. Zero bytes may appear in the compiled regular expression. */
450 /* Succeed right away--no more backtracking. */
453 /* Followed by one byte giving n, then by n literal bytes. */
456 /* Matches any (more or less) character. */
459 /* Matches any one char belonging to specified set. First
460 following byte is number of bitmap bytes. Then come bytes
461 for a bitmap saying which chars are in. Bits in each byte
462 are ordered low-bit-first. A character is in the set if its
463 bit is 1. A character too large to have a bit in the map is
464 automatically not in the set. */
467 /* Same parameters as charset, but match any character that is
468 not one of those specified. */
471 /* Start remembering the text that is matched, for storing in a
472 register. Followed by one byte with the register number, in
473 the range 1 to the pattern buffer's re_ngroups
474 field. Then followed by one byte with the number of groups
475 inner to this one. (This last has to be part of the
476 start_memory only because we need it in the on_failure_jump
480 /* Stop remembering the text that is matched and store it in a
481 memory register. Followed by one byte with the register
482 number, in the range 1 to `re_ngroups' in the
483 pattern buffer, and one byte with the number of inner groups,
484 just like `start_memory'. (We need the number of inner
485 groups here because we don't have any easy way of finding the
486 corresponding start_memory when we're at a stop_memory.) */
489 /* Match a duplicate of something remembered. Followed by one
490 byte containing the register number. */
493 /* Fail unless at beginning of line. */
496 /* Fail unless at end of line. */
499 /* Succeeds if at beginning of buffer (if emacs) or at beginning
500 of string to be matched (if not). */
503 /* Analogously, for end of buffer/string. */
506 /* Followed by two byte relative address to which to jump. */
509 /* Same as jump, but marks the end of an alternative. */
512 /* Followed by two-byte relative address of place to resume at
513 in case of failure. */
516 /* Like on_failure_jump, but pushes a placeholder instead of the
517 current string position when executed. */
518 on_failure_keep_string_jump,
520 /* Throw away latest failure point and then jump to following
521 two-byte relative address. */
524 /* Change to pop_failure_jump if know won't have to backtrack to
525 match; otherwise change to jump. This is used to jump
526 back to the beginning of a repeat. If what follows this jump
527 clearly won't match what the repeat does, such that we can be
528 sure that there is no use backtracking out of repetitions
529 already matched, then we change it to a pop_failure_jump.
530 Followed by two-byte address. */
533 /* Jump to following two-byte address, and push a dummy failure
534 point. This failure point will be thrown away if an attempt
535 is made to use it for a failure. A `+' construct makes this
536 before the first repeat. Also used as an intermediary kind
537 of jump when compiling an alternative. */
540 /* Push a dummy failure point and continue. Used at the end of
544 /* Followed by two-byte relative address and two-byte number n.
545 After matching N times, jump to the address upon failure. */
548 /* Followed by two-byte relative address, and two-byte number n.
549 Jump to the address N times, then fail. */
552 /* Set the following two-byte relative address to the
553 subsequent two-byte number. The address *includes* the two
557 wordchar, /* Matches any word-constituent character. */
558 notwordchar, /* Matches any char that is not a word-constituent. */
560 wordbeg, /* Succeeds if at word beginning. */
561 wordend, /* Succeeds if at word end. */
563 wordbound, /* Succeeds if at a word boundary. */
564 notwordbound /* Succeeds if not at a word boundary. */
566 , before_dot, /* Succeeds if before point. */
567 at_dot, /* Succeeds if at point. */
568 after_dot, /* Succeeds if after point. */
570 /* Matches any character whose syntax is specified. Followed by
571 a byte which contains a syntax code, e.g., Sword. */
574 /* Matches any character whose syntax is not that specified. */
578 /* need extra stuff to be able to properly work with XEmacs/Mule
579 characters (which may take up more than one byte) */
580 , charset_mule, /* Matches any character belonging to specified set.
581 The set is stored in "unified range-table
582 format"; see rangetab.c. Unlike the `charset'
583 opcode, this can handle arbitrary characters. */
585 charset_mule_not /* Same parameters as charset_mule, but match any
586 character that is not one of those specified. */
587 /* 97/2/17 jhod: The following two were merged back in from the Mule
588 2.3 code to enable some language specific processing */
589 , categoryspec, /* Matches entries in the character category tables */
590 notcategoryspec /* The opposite of the above */
594 /* Common operations on the compiled pattern. */
596 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
598 #define STORE_NUMBER(destination, number) \
600 (destination)[0] = (number) & 0377; \
601 (destination)[1] = (number) >> 8; \
604 /* Same as STORE_NUMBER, except increment DESTINATION to
605 the byte after where the number is stored. Therefore, DESTINATION
606 must be an lvalue. */
608 #define STORE_NUMBER_AND_INCR(destination, number) \
610 STORE_NUMBER (destination, number); \
611 (destination) += 2; \
614 /* Put into DESTINATION a number stored in two contiguous bytes starting
617 #define EXTRACT_NUMBER(destination, source) \
619 (destination) = *(source) & 0377; \
620 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
623 #ifdef REGEX_DEBUG_FLAG
624 static void extract_number(int *dest, re_char * source)
626 int temp = SIGN_EXTEND_CHAR(*(source + 1));
627 *dest = *source & 0377;
631 #ifndef EXTRACT_MACROS /* To debug the macros. */
632 #undef EXTRACT_NUMBER
633 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
634 #endif /* not EXTRACT_MACROS */
636 #endif /* REGEX_DEBUG_FLAG */
638 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
639 SOURCE must be an lvalue. */
641 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
643 EXTRACT_NUMBER (destination, source); \
647 #ifdef REGEX_DEBUG_FLAG
649 extract_number_and_incr(int *destination, unsigned char **source)
651 extract_number(destination, *source);
655 #ifndef EXTRACT_MACROS
656 #undef EXTRACT_NUMBER_AND_INCR
657 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
658 extract_number_and_incr (&dest, &src)
659 #endif /* not EXTRACT_MACROS */
661 #endif /* REGEX_DEBUG_FLAG */
663 /* If REGEX_DEBUG_FLAG is defined, Regex prints many voluminous messages about
664 what it is doing (if the variable `debug' is nonzero). If linked with the
665 main program in `iregex.c', you can enter patterns and strings interactively.
666 And if linked with the main program in `main.c' and the other test files, you
667 can run the already-written tests. */
669 #if defined (REGEX_DEBUG_FLAG)
671 /* We use standard I/O for debugging. */
675 /* XEmacs provides its own version of assert() */
676 /* It is useful to test things that ``must'' be true when debugging. */
681 /* no point in having this one, since we do not offer any accessor to this */
682 static int debug = 0;
685 #define DEBUG_STATEMENT(e) e
686 #define DEBUG_PRINT1(x) printf(x)
687 #define DEBUG_PRINT2(x1, x2) printf(x1, x2)
688 #define DEBUG_PRINT3(x1, x2, x3) printf(x1, x2, x3)
689 #define DEBUG_PRINT4(x1, x2, x3, x4) printf(x1, x2, x3, x4)
690 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
691 print_partial_compiled_pattern(s, e)
692 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
693 print_double_string (w, s1, sz1, s2, sz2)
695 /* Print the fastmap in human-readable form. */
697 static void print_fastmap(char *fastmap)
699 unsigned was_a_range = 0;
702 while (i < (1 << BYTEWIDTH)) {
706 while (i < (1 << BYTEWIDTH) && fastmap[i]) {
719 /* Print a compiled pattern string in human-readable form, starting at
720 the START pointer into it and ending just before the pointer END. */
722 /* can't help it ... the ugly mule code changes `start' upon alignment,
723 * so this can't have the const qualifier ... bugger */
725 print_partial_compiled_pattern(unsigned char *start, re_char *end)
728 unsigned char *p = start;
736 /* Loop over pattern commands. */
738 printf("%ld:\t", (long)(p - start));
740 switch ((const re_opcode_t)*p++) {
747 printf("/exactn/%d", mcnt);
757 printf("/start_memory/%d/%d", mcnt, *p++);
762 printf("/stop_memory/%d/%d", mcnt, *p++);
766 printf("/duplicate/%d", *p++);
775 REGISTER int c, last = -100;
776 REGISTER int in_range = 0;
778 printf("/charset [%s",
779 (re_opcode_t) * (p - 1) == charset_not
782 assert(p + *p < pend);
784 for (c = 0; c < 256; c++)
785 if (((unsigned char)(c / 8) < *p)
788 /* Are we starting a range? */
789 if (last + 1 == c && !in_range) {
793 /* Have we broken a range? */
794 else if (last + 1 != c
817 case charset_mule_not: {
820 printf("/charset_mule [%s",
821 (re_opcode_t) * (p - 1) == charset_mule_not
823 /* u_r_t_nentries() CHANGES its arg, why on earth
824 * is this marked const here then? :O -hrop */
825 nentries = unified_range_table_nentries(p);
826 for (i = 0; i < nentries; i++) {
827 EMACS_INT first, last;
828 Lisp_Object dummy_val;
830 /* u_r_t_get_range CHANGES its arg ...
831 * p can't be const thence */
832 unified_range_table_get_range(p, i,
839 printf("(0x%lx)", (long)first);
850 p += unified_range_table_bytes_used(p);
863 case on_failure_jump:
864 extract_number_and_incr(&mcnt, &p);
865 printf("/on_failure_jump to %ld",
866 (long)(p + mcnt - start));
869 case on_failure_keep_string_jump:
870 extract_number_and_incr(&mcnt, &p);
871 printf("/on_failure_keep_string_jump to %ld",
872 (long)(p + mcnt - start));
875 case dummy_failure_jump:
876 extract_number_and_incr(&mcnt, &p);
877 printf("/dummy_failure_jump to %ld",
878 (long)(p + mcnt - start));
881 case push_dummy_failure:
882 printf("/push_dummy_failure");
886 extract_number_and_incr(&mcnt, &p);
887 printf("/maybe_pop_jump to %ld",
888 (long)(p + mcnt - start));
891 case pop_failure_jump:
892 extract_number_and_incr(&mcnt, &p);
893 printf("/pop_failure_jump to %ld",
894 (long)(p + mcnt - start));
898 extract_number_and_incr(&mcnt, &p);
899 printf("/jump_past_alt to %ld",
900 (long)(p + mcnt - start));
904 extract_number_and_incr(&mcnt, &p);
905 printf("/jump to %ld", (long)(p + mcnt - start));
909 extract_number_and_incr(&mcnt, &p);
910 extract_number_and_incr(&mcnt2, &p);
911 printf("/succeed_n to %ld, %d times",
912 (long)(p + mcnt - start), mcnt2);
916 extract_number_and_incr(&mcnt, &p);
917 extract_number_and_incr(&mcnt2, &p);
918 printf("/jump_n to %ld, %d times",
919 (long)(p + mcnt - start), mcnt2);
923 extract_number_and_incr(&mcnt, &p);
924 extract_number_and_incr(&mcnt2, &p);
925 printf("/set_number_at location %ld to %d",
926 (long)(p + mcnt - start), mcnt2);
930 printf("/wordbound");
934 printf("/notwordbound");
946 printf("/before_dot");
954 printf("/after_dot");
958 printf("/syntaxspec");
964 printf("/notsyntaxspec");
970 /* 97/2/17 jhod Mule category patch */
972 printf("/categoryspec");
977 case notcategoryspec:
978 printf("/notcategoryspec");
982 /* end of category patch */
991 printf("/notwordchar");
1004 printf("?%d", *(p - 1));
1010 printf("%ld:\tend of pattern.\n", (long)(p - start));
1014 print_compiled_pattern(struct re_pattern_buffer *bufp)
1018 re_char *buffer = bufp->buffer;
1020 unsigned char *buffer = bufp->buffer;
1023 print_partial_compiled_pattern(buffer, buffer + bufp->used);
1024 printf("%ld bytes used/%ld bytes allocated.\n", bufp->used,
1027 if (bufp->fastmap_accurate && bufp->fastmap) {
1028 printf("fastmap: ");
1029 print_fastmap(bufp->fastmap);
1032 printf("re_nsub: %ld\t", (long)bufp->re_nsub);
1033 printf("re_ngroups: %ld\t", (long)bufp->re_ngroups);
1034 printf("regs_alloc: %d\t", bufp->regs_allocated);
1035 printf("can_be_null: %d\t", bufp->can_be_null);
1036 printf("newline_anchor: %d\n", bufp->newline_anchor);
1037 printf("no_sub: %d\t", bufp->no_sub);
1038 printf("not_bol: %d\t", bufp->not_bol);
1039 printf("not_eol: %d\t", bufp->not_eol);
1040 printf("syntax: %d\n", bufp->syntax);
1041 /* Perhaps we should print the translate table? */
1042 /* and maybe the category table? */
1044 if (bufp->external_to_internal_register) {
1047 printf("external_to_internal_register:\n");
1048 for (i = 0; i <= bufp->re_nsub; i++) {
1051 printf("%d -> %d", i,
1052 bufp->external_to_internal_register[i]);
1059 print_double_string(re_char * where, re_char * string1, int size1,
1060 re_char * string2, int size2)
1065 Element_count this_char;
1067 if (FIRST_STRING_P(where)) {
1068 for (this_char = where - string1; this_char < size1;
1070 putchar(string1[this_char]);
1075 for (this_char = where - string2; this_char < size2;
1077 putchar(string2[this_char]);
1081 #else /* not REGEX_DEBUG_FLAG */
1086 #define DEBUG_STATEMENT(e)
1087 #define DEBUG_PRINT1(x)
1088 #define DEBUG_PRINT2(x1, x2)
1089 #define DEBUG_PRINT3(x1, x2, x3)
1090 #define DEBUG_PRINT4(x1, x2, x3, x4)
1091 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1092 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1094 #endif /* REGEX_DEBUG_FLAG */
1096 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1097 also be assigned to arbitrarily: each pattern buffer stores its own
1098 syntax, so it can be changed between regex compilations. */
1099 /* This has no initializer because initialized variables in Emacs
1100 become read-only after dumping. */
1101 reg_syntax_t re_syntax_options;
1103 /* Specify the precise syntax of regexps for compilation. This provides
1104 for compatibility for various utilities which historically have
1105 different, incompatible syntaxes.
1107 The argument SYNTAX is a bit mask comprised of the various bits
1108 defined in regex.h. We return the old syntax. */
1110 reg_syntax_t re_set_syntax(reg_syntax_t syntax)
1112 reg_syntax_t ret = re_syntax_options;
1114 re_syntax_options = syntax;
1118 /* This table gives an error message for each of the error codes listed
1119 in regex.h. Obviously the order here has to be same as there.
1120 POSIX doesn't require that we do anything for REG_NOERROR,
1121 but why not be nice? */
1123 static const char *re_error_msgid[] = {
1124 "Success", /* REG_NOERROR */
1125 "No match", /* REG_NOMATCH */
1126 "Invalid regular expression", /* REG_BADPAT */
1127 "Invalid collation character", /* REG_ECOLLATE */
1128 "Invalid character class name", /* REG_ECTYPE */
1129 "Trailing backslash", /* REG_EESCAPE */
1130 "Invalid back reference", /* REG_ESUBREG */
1131 "Unmatched [ or [^", /* REG_EBRACK */
1132 "Unmatched ( or \\(", /* REG_EPAREN */
1133 "Unmatched \\{", /* REG_EBRACE */
1134 "Invalid content of \\{\\}", /* REG_BADBR */
1135 "Invalid range end", /* REG_ERANGE */
1136 "Memory exhausted", /* REG_ESPACE */
1137 "Invalid preceding regular expression", /* REG_BADRPT */
1138 "Premature end of regular expression", /* REG_EEND */
1139 "Regular expression too big", /* REG_ESIZE */
1140 "Unmatched ) or \\)", /* REG_ERPAREN */
1142 "Invalid syntax designator", /* REG_ESYNTAX */
1145 "Ranges may not span charsets", /* REG_ERANGESPAN */
1146 "Invalid category designator", /* REG_ECATEGORY */
1150 /* Avoiding alloca during matching, to placate r_alloc. */
1152 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1153 searching and matching functions should not call alloca. On some
1154 systems, alloca is implemented in terms of malloc, and if we're
1155 using the relocating allocator routines, then malloc could cause a
1156 relocation, which might (if the strings being searched are in the
1157 ralloc heap) shift the data out from underneath the regexp
1160 Here's another reason to avoid allocation: Emacs
1161 processes input from X in a signal handler; processing X input may
1162 call malloc; if input arrives while a matching routine is calling
1163 malloc, then we're scrod. But Emacs can't just block input while
1164 calling matching routines; then we don't notice interrupts when
1165 they come in. So, Emacs blocks input around all regexp calls
1166 except the matching calls, which it leaves unprotected, in the
1167 faith that they will not malloc. */
1169 /* Normally, this is fine. */
1170 #define MATCH_MAY_ALLOCATE
1172 /* When using GNU C, we are not REALLY using the C alloca, no matter
1173 what config.h may say. So don't take precautions for it. */
1178 /* The match routines may not allocate if (1) they would do it with malloc
1179 and (2) it's not safe for them to use malloc.
1180 Note that if REL_ALLOC is defined, matching would not use malloc for the
1181 failure stack, but we would still use it for the register vectors;
1182 so REL_ALLOC should not affect this. */
1183 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1184 #undef MATCH_MAY_ALLOCATE
1187 /* Failure stack declarations and macros; both re_compile_fastmap and
1188 re_match_2 use a failure stack. These have to be macros because of
1189 REGEX_ALLOCATE_STACK. */
1191 /* Number of failure points for which to initially allocate space
1192 when matching. If this number is exceeded, we allocate more
1193 space, so it is not a hard limit. */
1194 #ifndef INIT_FAILURE_ALLOC
1195 #define INIT_FAILURE_ALLOC 20
1198 /* Roughly the maximum number of failure points on the stack. Would be
1199 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1200 This is a variable only so users of regex can assign to it; we never
1201 change it ourselves. */
1202 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1203 /* 4400 was enough to cause a crash on Alpha OSF/1,
1204 whose default stack limit is 2mb. */
1205 int re_max_failures = 50000;
1207 int re_max_failures = 2000;
1210 union fail_stack_elt {
1211 const unsigned char *pointer;
1215 typedef union fail_stack_elt fail_stack_elt_t;
1218 fail_stack_elt_t *stack;
1220 Element_count avail; /* Offset of next open position. */
1223 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1224 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1225 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1227 /* Define macros to initialize and free the failure stack.
1228 Do `return -2' if the alloc fails. */
1230 #ifdef MATCH_MAY_ALLOCATE
1231 #define INIT_FAIL_STACK() \
1233 fail_stack.stack = (fail_stack_elt_t *) \
1234 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \
1235 sizeof (fail_stack_elt_t)); \
1237 if (fail_stack.stack == NULL) \
1240 fail_stack.size = INIT_FAILURE_ALLOC; \
1241 fail_stack.avail = 0; \
1244 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1246 #define INIT_FAIL_STACK() \
1248 fail_stack.avail = 0; \
1251 #define RESET_FAIL_STACK()
1254 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1256 Return 1 if succeeds, and 0 if either ran out of memory
1257 allocating space for it or it was already too large.
1259 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1261 #define DOUBLE_FAIL_STACK(fail_stack) \
1262 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1264 : ((fail_stack).stack = (fail_stack_elt_t *) \
1265 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1266 (fail_stack).size * sizeof (fail_stack_elt_t), \
1267 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1269 (fail_stack).stack == NULL \
1271 : ((fail_stack).size <<= 1, \
1274 /* Push pointer POINTER on FAIL_STACK.
1275 Return 1 if was able to do so and 0 if ran out of memory allocating
1277 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1278 ((FAIL_STACK_FULL () \
1279 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1281 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1284 /* Push a pointer value onto the failure stack.
1285 Assumes the variable `fail_stack'. Probably should only
1286 be called from within `PUSH_FAILURE_POINT'. */
1287 #define PUSH_FAILURE_POINTER(item) \
1288 fail_stack.stack[fail_stack.avail++].pointer = \
1289 (const unsigned char*)(item)
1291 /* This pushes an integer-valued item onto the failure stack.
1292 Assumes the variable `fail_stack'. Probably should only
1293 be called from within `PUSH_FAILURE_POINT'. */
1294 #define PUSH_FAILURE_INT(item) \
1295 fail_stack.stack[fail_stack.avail++].integer = (item)
1297 /* Push a fail_stack_elt_t value onto the failure stack.
1298 Assumes the variable `fail_stack'. Probably should only
1299 be called from within `PUSH_FAILURE_POINT'. */
1300 #define PUSH_FAILURE_ELT(item) \
1301 fail_stack.stack[fail_stack.avail++] = (item)
1303 /* These three POP... operations complement the three PUSH... operations.
1304 All assume that `fail_stack' is nonempty. */
1305 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1306 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1307 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1309 /* Used to omit pushing failure point id's when we're not debugging. */
1310 #ifdef REGEX_DEBUG_FLAG
1311 #define DEBUG_PUSH PUSH_FAILURE_INT
1312 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1314 #define DEBUG_PUSH(item)
1315 #define DEBUG_POP(item_addr)
1318 /* Push the information about the state we will need
1319 if we ever fail back to it.
1321 Requires variables fail_stack, regstart, regend, reg_info, and
1322 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1325 Does `return FAILURE_CODE' if runs out of memory. */
1327 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1328 #define DECLARE_DESTINATION char *destination
1330 #define DECLARE_DESTINATION DECLARE_NOTHING
1333 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1335 DECLARE_DESTINATION; \
1336 /* Must be int, so when we don't save any registers, the\
1337 arithmetic \ of 0 + -1 isn't done as unsigned. */ \
1340 DEBUG_STATEMENT (failure_id++); \
1341 DEBUG_STATEMENT (nfailure_points_pushed++); \
1342 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1343 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1344 (long unsigned int)(fail_stack).avail); \
1345 DEBUG_PRINT2 (" size: %lu\n", \
1346 (long unsigned int)(fail_stack).size); \
1348 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1349 DEBUG_PRINT2 (" available: %ld\n", \
1350 (long) REMAINING_AVAIL_SLOTS); \
1352 /* Ensure we have enough space allocated for what \
1353 * we will push. */ \
1354 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) { \
1355 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1356 return failure_code; \
1358 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1359 (unsigned long) (fail_stack).size); \
1360 DEBUG_PRINT2 (" slots available: %ld\n", \
1361 (long) REMAINING_AVAIL_SLOTS); \
1364 /* Push the info, starting with the registers. */ \
1365 DEBUG_PRINT1 ("\n"); \
1367 for (this_reg = lowest_active_reg; \
1368 this_reg <= highest_active_reg; \
1370 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1371 DEBUG_STATEMENT (num_regs_pushed++); \
1373 DEBUG_PRINT2(" start: %p\n", \
1374 regstart[this_reg]); \
1375 PUSH_FAILURE_POINTER(regstart[this_reg]); \
1377 DEBUG_PRINT2 (" end: %p\n", \
1378 regend[this_reg]); \
1379 PUSH_FAILURE_POINTER(regend[this_reg]); \
1381 DEBUG_PRINT2 (" info: 0x%p\n ", \
1382 ®_info[this_reg]); \
1383 DEBUG_PRINT2 (" match_null=%d", \
1384 REG_MATCH_NULL_STRING_P \
1385 (reg_info[this_reg])); \
1386 DEBUG_PRINT2 (" active=%d", \
1387 IS_ACTIVE (reg_info[this_reg])); \
1388 DEBUG_PRINT2 (" matched_something=%d", \
1390 (reg_info[this_reg])); \
1391 DEBUG_PRINT2 (" ever_matched_something=%d", \
1392 EVER_MATCHED_SOMETHING \
1393 (reg_info[this_reg])); \
1394 DEBUG_PRINT1 ("\n"); \
1395 PUSH_FAILURE_ELT(reg_info[this_reg].word); \
1398 DEBUG_PRINT2 (" Pushing low active reg: %d\n", \
1399 lowest_active_reg); \
1400 PUSH_FAILURE_INT (lowest_active_reg); \
1402 DEBUG_PRINT2 (" Pushing high active reg: %d\n", \
1403 highest_active_reg); \
1404 PUSH_FAILURE_INT (highest_active_reg); \
1406 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", \
1407 (long) pattern_place); \
1408 DEBUG_PRINT_COMPILED_PATTERN(bufp, pattern_place, pend); \
1409 PUSH_FAILURE_POINTER (pattern_place); \
1411 DEBUG_PRINT2(" Pushing string %p: `", \
1413 DEBUG_PRINT_DOUBLE_STRING(string_place, \
1416 DEBUG_PRINT1("'\n"); \
1417 PUSH_FAILURE_POINTER (string_place); \
1419 DEBUG_PRINT2(" Pushing failure id: %u\n", failure_id); \
1420 DEBUG_PUSH(failure_id); \
1423 /* This is the number of items that are pushed and popped on the stack
1424 for each register. */
1425 #define NUM_REG_ITEMS 3
1427 /* Individual items aside from the registers. */
1429 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1431 #define NUM_NONREG_ITEMS 4
1434 /* We push at most this many items on the stack. */
1435 /* We used to use (num_regs - 1), which is the number of registers
1436 this regexp will save; but that was changed to 5
1437 to avoid stack overflow for a regexp with lots of parens. */
1438 #define MAX_FAILURE_ITEMS (5 * (NUM_REG_ITEMS + NUM_NONREG_ITEMS))
1440 /* We actually push this many items. */
1441 #define NUM_FAILURE_ITEMS \
1442 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1445 /* How many items can still be added to the stack without overflowing it. */
1446 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1448 /* Pops what PUSH_FAIL_STACK pushes.
1450 We restore into the parameters, all of which should be lvalues:
1451 STR -- the saved data position.
1452 PAT -- the saved pattern position.
1453 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1454 REGSTART, REGEND -- arrays of string positions.
1455 REG_INFO -- array of information about each subexpression.
1457 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1458 `pend', `string1', `size1', `string2', and `size2'. */
1460 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1461 regstart, regend, reg_info) \
1464 const unsigned char *string_temp; \
1465 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id); \
1467 assert (!FAIL_STACK_EMPTY ()); \
1469 /* Remove failure points and point to \
1470 * how many regs pushed. */ \
1471 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1472 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1473 (unsigned long) fail_stack.avail); \
1474 DEBUG_PRINT2 (" size: %lu\n", \
1475 (unsigned long) fail_stack.size); \
1477 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1479 DEBUG_POP (&ffailure_id.integer); \
1480 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1481 * (unsigned int *) &ffailure_id); \
1483 /* If the saved string location is NULL, it \
1484 came from an on_failure_keep_string_jump opcode, \
1485 and we want to throw away the saved NULL, thus \
1486 retaining our current position in the string. */ \
1487 string_temp = (const unsigned char *) \
1488 POP_FAILURE_POINTER (); \
1489 if (string_temp != NULL) \
1490 str = string_temp; \
1492 DEBUG_PRINT2(" Popping string %p: `", str); \
1493 DEBUG_PRINT_DOUBLE_STRING(str, \
1496 DEBUG_PRINT1("'\n"); \
1498 pat = (const unsigned char*)POP_FAILURE_POINTER(); \
1499 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1500 DEBUG_PRINT_COMPILED_PATTERN(bufp, pat, pend); \
1502 /* Restore register info. */ \
1503 high_reg = (unsigned) POP_FAILURE_INT (); \
1504 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1506 low_reg = (unsigned) POP_FAILURE_INT (); \
1507 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1509 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) { \
1510 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1512 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1513 DEBUG_PRINT2 (" info: %p\n", \
1514 ®_info[this_reg]); \
1516 regend[this_reg] = POP_FAILURE_POINTER (); \
1517 DEBUG_PRINT2 (" end: %p\n", \
1518 regend[this_reg]); \
1520 regstart[this_reg] = POP_FAILURE_POINTER (); \
1521 DEBUG_PRINT2 (" start: %p\n", \
1522 regstart[this_reg]); \
1525 set_regs_matched_done = 0; \
1526 DEBUG_STATEMENT (nfailure_points_popped++); \
1527 } while (0) /* POP_FAILURE_POINT */
1529 /* Structure for per-register (a.k.a. per-group) information.
1530 Other register information, such as the
1531 starting and ending positions (which are addresses), and the list of
1532 inner groups (which is a bits list) are maintained in separate
1535 We are making a (strictly speaking) nonportable assumption here: that
1536 the compiler will pack our bit fields into something that fits into
1537 the type of `word', i.e., is something that fits into one item on the
1541 fail_stack_elt_t word;
1543 /* This field is one if this group can match the empty string,
1544 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1545 #define MATCH_NULL_UNSET_VALUE 3
1546 unsigned match_null_string_p:2;
1547 unsigned is_active:1;
1548 unsigned matched_something:1;
1549 unsigned ever_matched_something:1;
1551 } register_info_type;
1553 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1554 #define IS_ACTIVE(R) ((R).bits.is_active)
1555 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1556 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1558 /* Call this when have matched a real character; it sets `matched' flags
1559 for the subexpressions which we are currently inside. Also records
1560 that those subexprs have matched. */
1561 #define SET_REGS_MATCHED() \
1564 if (!set_regs_matched_done) \
1567 set_regs_matched_done = 1; \
1568 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1570 MATCHED_SOMETHING (reg_info[r]) \
1571 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1578 /* Registers are set to a sentinel when they haven't yet matched. */
1579 static unsigned char reg_unset_dummy;
1580 #define REG_UNSET_VALUE (®_unset_dummy)
1581 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1583 /* Subroutine declarations and macros for regex_compile. */
1585 /* Fetch the next character in the uncompiled pattern---translating it
1586 if necessary. Also cast from a signed character in the constant
1587 string passed to us by the user to an unsigned char that we can use
1588 as an array index (in, e.g., `translate'). */
1589 #define PATFETCH(c) \
1592 c = TRANSLATE (c); \
1595 /* Fetch the next character in the uncompiled pattern, with no
1597 #define PATFETCH_RAW(c) \
1598 do {if (p == pend) return REG_EEND; \
1599 assert (p < pend); \
1600 c = charptr_emchar (p); \
1604 /* Go backwards one character in the pattern. */
1605 #define PATUNFETCH DEC_CHARPTR (p)
1609 #define PATFETCH_EXTENDED(emch) \
1610 do {if (p == pend) return REG_EEND; \
1611 assert (p < pend); \
1612 emch = charptr_emchar ((const Bufbyte *) p); \
1614 if (TRANSLATE_P (translate) && emch < 0x80) \
1615 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1618 #define PATFETCH_RAW_EXTENDED(emch) \
1619 do {if (p == pend) return REG_EEND; \
1620 assert (p < pend); \
1621 emch = charptr_emchar ((const Bufbyte *) p); \
1625 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1627 #define PATFETCH_EITHER(emch) \
1629 if (has_extended_chars) \
1630 PATFETCH_EXTENDED (emch); \
1635 #define PATFETCH_RAW_EITHER(emch) \
1637 if (has_extended_chars) \
1638 PATFETCH_RAW_EXTENDED (emch); \
1640 PATFETCH_RAW (emch); \
1643 #define PATUNFETCH_EITHER \
1645 if (has_extended_chars) \
1646 PATUNFETCH_EXTENDED (emch); \
1648 PATUNFETCH (emch); \
1651 #else /* not MULE */
1653 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1654 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1655 #define PATUNFETCH_EITHER PATUNFETCH
1659 /* If `translate' is non-null, return translate[D], else just D. We
1660 cast the subscript to translate because some data is declared as
1661 `char *', to avoid warnings when a string constant is passed. But
1662 when we use a character as a subscript we must make it unsigned. */
1663 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1665 /* Macros for outputting the compiled pattern into `buffer'. */
1667 /* If the buffer isn't allocated when it comes in, use this. */
1668 #define INIT_BUF_SIZE 32
1670 /* Make sure we have at least N more bytes of space in buffer. */
1671 #define GET_BUFFER_SPACE(n) \
1672 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1675 /* Make sure we have one more byte of buffer space and then add C to it. */
1676 #define BUF_PUSH(c) \
1678 GET_BUFFER_SPACE (1); \
1679 *buf_end++ = (unsigned char) (c); \
1682 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1683 #define BUF_PUSH_2(c1, c2) \
1685 GET_BUFFER_SPACE (2); \
1686 *buf_end++ = (unsigned char) (c1); \
1687 *buf_end++ = (unsigned char) (c2); \
1690 /* As with BUF_PUSH_2, except for three bytes. */
1691 #define BUF_PUSH_3(c1, c2, c3) \
1693 GET_BUFFER_SPACE (3); \
1694 *buf_end++ = (unsigned char) (c1); \
1695 *buf_end++ = (unsigned char) (c2); \
1696 *buf_end++ = (unsigned char) (c3); \
1699 /* Store a jump with opcode OP at LOC to location TO. We store a
1700 relative address offset by the three bytes the jump itself occupies. */
1701 #define STORE_JUMP(op, loc, to) \
1702 store_op1 (op, loc, (to) - (loc) - 3)
1704 /* Likewise, for a two-argument jump. */
1705 #define STORE_JUMP2(op, loc, to, arg) \
1706 store_op2 (op, loc, (to) - (loc) - 3, arg)
1708 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1710 #define INSERT_JUMP(op, loc, to) \
1711 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1713 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1715 #define INSERT_JUMP2(op, loc, to, arg) \
1716 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1718 /* This is not an arbitrary limit: the arguments which represent offsets
1719 into the pattern are two bytes long. So if 2^16 bytes turns out to
1720 be too small, many things would have to change. */
1721 #define MAX_BUF_SIZE (1L << 16)
1723 /* Extend the buffer by twice its current size via realloc and
1724 reset the pointers that pointed into the old block to point to the
1725 correct places in the new one. If extending the buffer results in it
1726 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1727 #define EXTEND_BUFFER() \
1729 re_char *old_buffer = bufp->buffer; \
1730 if (bufp->allocated == MAX_BUF_SIZE) \
1732 bufp->allocated <<= 1; \
1733 if (bufp->allocated > MAX_BUF_SIZE) \
1734 bufp->allocated = MAX_BUF_SIZE; \
1735 bufp->buffer = (unsigned char *)xrealloc( \
1736 bufp->buffer, bufp->allocated); \
1737 if (bufp->buffer == NULL) { \
1738 return REG_ESPACE; \
1740 /* If the buffer moved, move all the pointers into it. */ \
1741 if (old_buffer != bufp->buffer) { \
1742 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1743 begalt = (begalt - old_buffer) + bufp->buffer; \
1744 if (fixup_alt_jump) { \
1746 (fixup_alt_jump - old_buffer) + \
1750 laststart = (laststart - old_buffer) + \
1753 if (pending_exact) { \
1755 (pending_exact - old_buffer) + \
1761 /* Since we have one byte reserved for the register number argument to
1762 {start,stop}_memory, the maximum number of groups we can report
1763 things about is what fits in that byte. */
1764 #define MAX_REGNUM 255
1766 /* But patterns can have more than `MAX_REGNUM' registers. We just
1767 ignore the excess. */
1768 typedef unsigned regnum_t;
1770 #define INIT_REG_TRANSLATE_SIZE 5
1772 /* Macros for the compile stack. */
1774 /* Since offsets can go either forwards or backwards, this type needs to
1775 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1776 typedef int pattern_offset_t;
1779 pattern_offset_t begalt_offset;
1780 pattern_offset_t fixup_alt_jump;
1781 pattern_offset_t inner_group_offset;
1782 pattern_offset_t laststart_offset;
1784 } compile_stack_elt_t;
1787 compile_stack_elt_t *stack;
1789 unsigned avail; /* Offset of next open position. */
1790 } compile_stack_type;
1792 #define INIT_COMPILE_STACK_SIZE 32
1794 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1795 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1797 /* The next available element. */
1798 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1800 /* Set the bit for character C in a bit vector. */
1801 #define SET_LIST_BIT(c) \
1802 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1803 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1807 /* Set the "bit" for character C in a range table. */
1808 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1810 /* Set the "bit" for character c in the appropriate table. */
1811 #define SET_EITHER_BIT(c) \
1813 if (has_extended_chars) \
1814 SET_RANGETAB_BIT (c); \
1819 #else /* not MULE */
1821 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1825 /* Get the next unsigned number in the uncompiled pattern. */
1826 #define GET_UNSIGNED_NUMBER(num) \
1830 while (ISDIGIT (c)) \
1834 num = num * 10 + c - '0'; \
1842 #define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
1845 static void store_op1(re_opcode_t op, unsigned char *loc, int arg);
1846 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1847 static void insert_op1(re_opcode_t op, unsigned char *loc, int arg,
1848 unsigned char *end);
1849 static void insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1850 unsigned char *end);
1851 static re_bool at_begline_loc_p(re_char*, re_char*, reg_syntax_t);
1852 static re_bool at_endline_loc_p(re_char*, re_char*, int syntax);
1853 static re_bool group_in_compile_stack(compile_stack_type compile_stack,
1855 static reg_errcode_t compile_range(re_char ** p_ptr, re_char * pend,
1856 RE_TRANSLATE_TYPE translate,
1857 reg_syntax_t syntax, unsigned char *b);
1859 static reg_errcode_t compile_extended_range(re_char ** p_ptr,
1861 RE_TRANSLATE_TYPE translate,
1862 reg_syntax_t syntax,
1865 static re_bool group_match_null_string_p(unsigned char **p,
1867 register_info_type * reg_info);
1868 static re_bool alt_match_null_string_p(unsigned char *p, unsigned char *end,
1869 register_info_type * reg_info);
1870 static re_bool common_op_match_null_string_p(unsigned char **p,
1872 register_info_type * reg_info);
1873 static int bcmp_translate(const unsigned char *s1, const unsigned char *s2,
1874 REGISTER int len, RE_TRANSLATE_TYPE translate);
1875 static int re_match_2_internal(struct re_pattern_buffer *bufp,
1876 re_char * string1, int size1,
1877 re_char * string2, int size2, int pos,
1878 struct re_registers *regs, int stop);
1880 #ifndef MATCH_MAY_ALLOCATE
1882 /* If we cannot allocate large objects within re_match_2_internal,
1883 we make the fail stack and register vectors global.
1884 The fail stack, we grow to the maximum size when a regexp
1886 The register vectors, we adjust in size each time we
1887 compile a regexp, according to the number of registers it needs. */
1889 static fail_stack_type fail_stack;
1891 /* Size with which the following vectors are currently allocated.
1892 That is so we can make them bigger as needed,
1893 but never make them smaller. */
1894 static int regs_allocated_size;
1896 static re_char **regstart, **regend;
1897 static re_char **old_regstart, **old_regend;
1898 static re_char **best_regstart, **best_regend;
1899 static register_info_type *reg_info;
1900 static re_char **reg_dummy;
1901 static register_info_type *reg_info_dummy;
1903 /* Make the register vectors big enough for NUM_REGS registers,
1904 but don't make them smaller. */
1906 static void regex_grow_registers(int num_regs)
1908 if (num_regs > regs_allocated_size) {
1909 RETALLOC_IF(regstart, num_regs, re_char *);
1910 RETALLOC_IF(regend, num_regs, re_char *);
1911 RETALLOC_IF(old_regstart, num_regs, re_char *);
1912 RETALLOC_IF(old_regend, num_regs, re_char *);
1913 RETALLOC_IF(best_regstart, num_regs, re_char *);
1914 RETALLOC_IF(best_regend, num_regs, re_char *);
1915 RETALLOC_IF(reg_info, num_regs, register_info_type);
1916 RETALLOC_IF(reg_dummy, num_regs, re_char *);
1917 RETALLOC_IF(reg_info_dummy, num_regs, register_info_type);
1919 regs_allocated_size = num_regs;
1923 #endif /* not MATCH_MAY_ALLOCATE */
1925 /* auxiliary stuff, FSF theft */
1926 /* Character classes. */
1929 RECC_ALNUM, RECC_ALPHA, RECC_WORD,
1930 RECC_GRAPH, RECC_PRINT,
1931 RECC_LOWER, RECC_UPPER,
1932 RECC_PUNCT, RECC_CNTRL,
1933 RECC_DIGIT, RECC_XDIGIT,
1934 RECC_BLANK, RECC_SPACE,
1935 RECC_MULTIBYTE, RECC_NONASCII,
1936 RECC_ASCII, RECC_UNIBYTE
1939 /* Map a string to the char class it names (if any). */
1940 static inline re_wctype_t
1941 re_wctype(re_char *str)
1943 const char *string = (const char*)str;
1945 if (STREQ (string, "alnum")) {
1947 } else if (STREQ (string, "alpha")) {
1949 } else if (STREQ (string, "digit")) {
1951 } else if (STREQ (string, "graph")) {
1953 } else if (STREQ (string, "space")) {
1955 } else if (STREQ (string, "upper")) {
1957 } else if (STREQ (string, "lower")) {
1959 } else if (STREQ (string, "word")) {
1961 } else if (STREQ (string, "print")) {
1963 } else if (STREQ (string, "punct")) {
1965 } else if (STREQ (string, "xdigit")) {
1967 } else if (STREQ (string, "cntrl")) {
1969 } else if (STREQ (string, "blank")) {
1971 } else if (STREQ (string, "ascii")) {
1973 } else if (STREQ (string, "nonascii")) {
1974 return RECC_NONASCII;
1975 } else if (STREQ (string, "unibyte")) {
1976 return RECC_UNIBYTE;
1977 } else if (STREQ (string, "multibyte")) {
1978 return RECC_MULTIBYTE;
1984 /* True if CH is in the char class CC. */
1986 re_iswctype(int ch, re_wctype_t cc)
1990 return ISALNUM (ch);
1992 return ISALPHA (ch);
1994 return ISBLANK (ch);
1996 return ISCNTRL (ch);
1998 return ISDIGIT (ch);
2000 return ISGRAPH (ch);
2002 return ISLOWER (ch);
2004 return ISPRINT (ch);
2006 return ISPUNCT (ch);
2008 return ISSPACE (ch);
2010 return ISUPPER (ch);
2012 return ISXDIGIT (ch);
2014 return IS_REAL_ASCII (ch);
2016 return !IS_REAL_ASCII (ch);
2020 return ISUNIBYTE((unsigned int)ch);
2021 case RECC_MULTIBYTE:
2022 return !ISUNIBYTE((unsigned int)ch);
2032 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2033 Returns one of error codes defined in `regex.h', or zero for success.
2035 Assumes the `allocated' (and perhaps `buffer') and `translate'
2036 fields are set in BUFP on entry.
2038 If it succeeds, results are put in BUFP (if it returns an error, the
2039 contents of BUFP are undefined):
2040 `buffer' is the compiled pattern;
2041 `syntax' is set to SYNTAX;
2042 `used' is set to the length of the compiled pattern;
2043 `fastmap_accurate' is zero;
2044 `re_ngroups' is the number of groups/subexpressions (including shy
2046 `re_nsub' is the number of non-shy groups in PATTERN;
2047 `not_bol' and `not_eol' are zero;
2049 The `fastmap' and `newline_anchor' fields are neither
2050 examined nor set. */
2052 static reg_errcode_t
2053 regex_compile(re_char * pattern, int size, reg_syntax_t syntax,
2054 struct re_pattern_buffer *bufp)
2056 /* We fetch characters from PATTERN here. We declare these as int
2057 (or possibly long) so that chars above 127 can be used as
2058 array indices. The macros that fetch a character from the pattern
2059 make sure to coerce to unsigned char before assigning, so we won't
2060 get bitten by negative numbers here. */
2061 /* XEmacs change: used to be unsigned char. */
2062 REGISTER EMACS_INT c, c1;
2064 /* A random temporary spot in PATTERN. */
2067 /* Points to the end of the buffer, where we should append. */
2068 REGISTER unsigned char *buf_end;
2070 /* Keeps track of unclosed groups. */
2071 compile_stack_type compile_stack;
2073 /* Points to the current (ending) position in the pattern. */
2074 re_char *p = pattern;
2075 re_char *pend = pattern + size;
2077 /* How to translate the characters in the pattern. */
2078 RE_TRANSLATE_TYPE translate = bufp->translate;
2080 /* Address of the count-byte of the most recently inserted `exactn'
2081 command. This makes it possible to tell if a new exact-match
2082 character can be added to that command or if the character requires
2083 a new `exactn' command. */
2084 unsigned char *pending_exact = 0;
2086 /* Address of start of the most recently finished expression.
2087 This tells, e.g., postfix * where to find the start of its
2088 operand. Reset at the beginning of groups and alternatives. */
2089 unsigned char *laststart = 0;
2091 /* Address of beginning of regexp, or inside of last group. */
2092 unsigned char *begalt;
2094 /* Place in the uncompiled pattern (i.e., the {) to
2095 which to go back if the interval is invalid. */
2096 re_char *beg_interval;
2098 /* Address of the place where a forward jump should go to the end of
2099 the containing expression. Each alternative of an `or' -- except the
2100 last -- ends with a forward jump of this sort. */
2101 unsigned char *fixup_alt_jump = 0;
2103 /* Counts open-groups as they are encountered. Remembered for the
2104 matching close-group on the compile stack, so the same register
2105 number is put in the stop_memory as the start_memory. */
2106 regnum_t regnum = 0;
2108 /* we need to close over compile_stack */
2109 # define free_stack(args...) xfree(compile_stack.stack)
2111 #ifdef REGEX_DEBUG_FLAG
2112 DEBUG_PRINT1("\nCompiling pattern: ");
2116 for (debug_count = 0; debug_count < size; debug_count++)
2117 putchar(pattern[debug_count]);
2122 /* Initialize the compile stack. */
2123 compile_stack.stack =
2124 TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2125 if (compile_stack.stack == NULL) {
2128 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2129 compile_stack.avail = 0;
2131 /* Initialize the pattern buffer. */
2132 bufp->syntax = syntax;
2133 bufp->fastmap_accurate = 0;
2134 bufp->not_bol = bufp->not_eol = 0;
2136 /* Set `used' to zero, so that if we return an error, the pattern
2137 printer (for debugging) will think there's no pattern. We reset it
2141 /* Always count groups, whether or not bufp->no_sub is set. */
2143 bufp->re_ngroups = 0;
2145 /* Allocate index translation array if needed. */
2146 if (bufp->external_to_internal_register == 0) {
2147 bufp->external_to_internal_register_size =
2148 INIT_REG_TRANSLATE_SIZE;
2149 RETALLOC(bufp->external_to_internal_register,
2150 bufp->external_to_internal_register_size, int);
2152 /* Initialize translations to impossible value to aid debugging. */
2156 bufp->external_to_internal_register[0] = 0;
2157 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2158 bufp->external_to_internal_register[i] =
2162 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2163 /* Initialize the syntax table. */
2167 if (bufp->allocated == 0) {
2169 /* If zero allocated, but buffer is non-null, try to
2170 realloc enough space. This loses if buffer's address
2171 is bogus, but that is the user's responsibility. */
2172 RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
2174 /* Caller did not allocate a buffer. Do it for them. */
2175 bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
2177 if (!bufp->buffer) {
2182 bufp->allocated = INIT_BUF_SIZE;
2185 begalt = buf_end = bufp->buffer;
2187 /* Loop through the uncompiled pattern until we're at the end. */
2193 if ( /* If at start of pattern, it's an operator. */
2195 /* If context independent, it's an operator. */
2196 || syntax & RE_CONTEXT_INDEP_ANCHORS
2197 /* Otherwise, depends on what's come before. */
2198 || at_begline_loc_p(pattern, p, syntax))
2206 if ( /* If at end of pattern, it's an
2209 /* If context independent, it's an
2211 || syntax & RE_CONTEXT_INDEP_ANCHORS
2212 /* Otherwise, depends on what's
2214 || at_endline_loc_p(p, pend, syntax))
2223 if ((syntax & RE_BK_PLUS_QM) ||
2224 (syntax & RE_LIMITED_OPS)) {
2230 re_bool zero_times_ok;
2231 re_bool many_times_ok;
2234 /* If there is no previous pattern... */
2236 if (syntax & RE_CONTEXT_INVALID_OPS) {
2239 } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) {
2244 /* true means zero/many matches are allowed. */
2245 zero_times_ok = c != '+';
2246 many_times_ok = c != '?';
2248 /* true means match shortest string possible. */
2251 /* If there is a sequence of repetition chars, collapse it
2252 down to just one (the right one). We can't combine
2253 interval operators with these because of, e.g., `a{2}*',
2254 which should only match an even number of `a's. */
2259 || (!(syntax & RE_BK_PLUS_QM)
2260 && (c == '+' || c == '?'))) ;
2262 else if (syntax & RE_BK_PLUS_QM
2270 if (!(c1 == '+' || c1 == '?')) {
2282 /* If we get here, we found another repeat
2284 if (!(syntax & RE_NO_MINIMAL_MATCHING)) {
2285 /* "*?" and "+?" and "??" are okay (and
2286 mean match minimally), but other
2287 sequences (such as "*??" and "+++")
2288 are rejected (reserved for future
2290 if (minimal || c != '?') {
2296 zero_times_ok |= c != '+';
2297 many_times_ok |= c != '?';
2301 /* Star, etc. applied to an empty pattern is equivalent
2302 to an empty pattern. */
2307 /* Now we know whether zero matches is allowed
2308 and whether two or more matches is allowed
2309 and whether we want minimal or maximal matching. */
2311 if (!many_times_ok) {
2313 0: /on_failure_jump to 6
2318 GET_BUFFER_SPACE(6);
2319 INSERT_JUMP(jump, laststart,
2322 INSERT_JUMP(on_failure_jump,
2326 } else if (zero_times_ok) {
2330 6: /on_failure_jump to 3
2333 GET_BUFFER_SPACE(6);
2334 INSERT_JUMP(jump, laststart,
2337 STORE_JUMP(on_failure_jump,
2344 3: /on_failure_jump to 0
2347 GET_BUFFER_SPACE(3);
2348 STORE_JUMP(on_failure_jump,
2349 buf_end, laststart);
2353 /* Are we optimizing this jump? */
2354 re_bool keep_string_p = false;
2356 if (many_times_ok) {
2357 /* More than one repetition is allowed,
2358 so put in at the end a backward
2359 relative jump from `buf_end' to
2360 before the next jump we're going to
2361 put in below (which jumps from
2362 laststart to after this jump).
2364 But if we are at the `*' in the exact
2365 sequence `.*\n', insert an
2366 unconditional jump backwards to the
2367 ., instead of the beginning of the
2368 loop. This way we only push a
2369 failure point once, instead of every
2370 time through the loop. */
2371 assert(p - 1 > pattern);
2373 /* Allocate the space for the jump. */
2374 GET_BUFFER_SPACE(3);
2376 /* We know we are not at the first
2377 character of the pattern, because
2378 laststart was nonzero. And we've
2379 already incremented `p', by the way,
2380 to be the character after the `*'.
2381 Do we have to do something analogous
2382 here for null bytes, because of
2384 if (*(p - 2) == '.' &&
2388 !(syntax & RE_DOT_NEWLINE)) {
2393 keep_string_p = true;
2395 /* Anything else. */
2401 /* We've added more stuff to the
2406 /* On failure, jump from laststart to buf_end +
2407 3, which will be the end of the buffer after
2408 this jump is inserted. */
2409 GET_BUFFER_SPACE(3);
2410 INSERT_JUMP(keep_string_p ?
2411 on_failure_keep_string_jump
2413 laststart, buf_end + 3);
2416 if (!zero_times_ok) {
2417 /* At least one repetition is required,
2418 so insert a `dummy_failure_jump'
2419 before the initial `on_failure_jump'
2420 instruction of the loop. This effects
2421 a skip over that instruction the
2422 first time we hit that loop. */
2423 GET_BUFFER_SPACE(3);
2424 INSERT_JUMP(dummy_failure_jump,
2435 laststart = buf_end;
2440 /* XEmacs change: this whole section */
2441 re_bool had_char_class = false;
2443 re_bool has_extended_chars = false;
2444 REGISTER Lisp_Object rtab = Qnil;
2452 /* Ensure that we have enough space to push a charset:
2453 the opcode, the length count, and the bitset; 34
2455 GET_BUFFER_SPACE(34);
2457 laststart = buf_end;
2459 /* We test `*p == '^' twice, instead of using an if
2460 statement, so we only need one BUF_PUSH. */
2461 BUF_PUSH(*p == '^' ? charset_not : charset);
2465 /* Remember the first position in the bracket
2469 /* Push the number of bytes in the bitmap. */
2470 BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2472 /* Clear the whole map. */
2473 memset(buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2475 /* charset_not matches newline according to a syntax
2477 if ((re_opcode_t) buf_end[-2] == charset_not
2478 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2482 start_over_with_extended:
2483 if (has_extended_chars) {
2484 /* There are extended chars here, which means we
2485 need to start over and shift to unified
2486 range-table format. */
2487 if (buf_end[-2] == charset)
2488 buf_end[-2] = charset_mule;
2490 buf_end[-2] = charset_mule_not;
2493 /* go back to the beginning of the charset,
2494 after a possible ^. */
2495 rtab = Vthe_lisp_rangetab;
2496 Fclear_range_table(rtab);
2498 /* charset_not matches newline according to a
2500 if ((re_opcode_t) buf_end[-1] ==
2503 RE_HAT_LISTS_NOT_NEWLINE))
2504 SET_EITHER_BIT('\n');
2508 /* Read in characters and ranges, setting map bits. */
2517 if (c >= 0x80 && !has_extended_chars) {
2518 has_extended_chars = 1;
2519 /* Frumble-bumble, we've found some
2520 extended chars. Need to start over,
2521 process everything using the general
2522 extended-char mechanism, and need to
2523 use charset_mule and charset_mule_not
2524 instead of charset and
2526 goto start_over_with_extended;
2529 /* \ might escape characters inside [...] and
2531 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) &&
2541 && !has_extended_chars) {
2542 has_extended_chars = 1;
2543 goto start_over_with_extended;
2550 /* Could be the end of the bracket
2551 expression. If it's not (i.e., when
2552 the bracket expression is `[]' so
2553 far), the ']' character bit gets set
2555 if (c == ']' && p != p1 + 1)
2558 /* Look ahead to see if it's a range
2559 when the last thing was a character
2561 if (had_char_class && c == '-'
2567 /* Look ahead to see if it's a range
2568 when the last thing was a character:
2569 if this is a hyphen not at the
2570 beginning or the end of a list, then
2571 it's the range operator. */
2573 && !(p - 2 >= pattern
2575 && !(p - 3 >= pattern
2582 if (*(const unsigned char *)p >= 0x80
2583 && !has_extended_chars) {
2584 has_extended_chars = 1;
2585 goto start_over_with_extended;
2587 if (has_extended_chars)
2588 ret = compile_extended_range
2589 (&p, pend, translate,
2594 (&p, pend, translate,
2596 if (ret != REG_NOERROR) {
2602 else if (p[0] == '-' && p[1] != ']') {
2603 /* This handles ranges made up of
2607 /* Move past the `-'. */
2611 if (*(const unsigned char*)p >= 0x80
2612 && !has_extended_chars) {
2613 has_extended_chars = 1;
2614 goto start_over_with_extended;
2616 if (has_extended_chars)
2617 ret = compile_extended_range(
2618 &p, pend, translate,
2622 ret = compile_range(
2623 &p, pend, translate,
2625 if (ret != REG_NOERROR) {
2631 /* See if we're at the beginning of a possible
2634 else if (syntax & RE_CHAR_CLASSES &&
2635 c == '[' && *p == ':') {
2636 /* Leave room for the null. */
2637 char str[CHAR_CLASS_MAX_LENGTH + 1];
2642 /* If pattern is `[[:'. */
2648 /* #### This code is unused.
2649 Correctness is not checked
2650 after TRT table change. */
2652 if (c == ':' || c == ']'
2655 CHAR_CLASS_MAX_LENGTH)
2657 str[c1++] = (char)c;
2661 /* If isn't a word bracketed by
2662 `[:' and `:]': undo the
2663 ending character, the
2664 letters, and leave the
2665 leading `:' and `[' (but set
2667 if (c == ':' && *p == ']') {
2673 if (wct == RECC_ERROR) {
2678 /* Throw away the ] at
2698 re_iswctype(ch, wct)) {
2702 if (re_iswctype(ch, wct)) {
2707 had_char_class = true;
2713 SET_EITHER_BIT('[');
2714 SET_EITHER_BIT(':');
2715 had_char_class = false;
2718 had_char_class = false;
2724 if (has_extended_chars) {
2725 /* We have a range table, not a bit vector. */
2727 unified_range_table_bytes_needed
2729 GET_BUFFER_SPACE(bytes_needed);
2730 unified_range_table_copy_data(rtab,
2733 unified_range_table_bytes_used
2738 /* Discard any (non)matching list bytes that are
2739 all 0 at the end of the map. Decrease the
2740 map-length byte too. */
2741 while ((int)buf_end[-1] > 0
2742 && buf_end[buf_end[-1] - 1] == 0)
2744 buf_end += buf_end[-1];
2749 if (syntax & RE_NO_BK_PARENS)
2755 if (syntax & RE_NO_BK_PARENS)
2761 if (syntax & RE_NEWLINE_ALT)
2767 if (syntax & RE_NO_BK_VBAR)
2773 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2774 goto handle_interval;
2784 /* Do not translate the character after the \, so that we can
2785 distinguish, e.g., \B from \b, even if we normally would
2786 translate, e.g., B to b. */
2791 if (syntax & RE_NO_BK_PARENS)
2792 goto normal_backslash;
2799 if (!(syntax & RE_NO_SHY_GROUPS)
2800 && p != pend && *p == '?') {
2821 /* Record the translation from capturing group index to
2822 register number, reallocating table as needed. */
2825 external_to_internal_register_size
2830 external_to_internal_register_size;
2832 external_to_internal_register_size
2835 external_to_internal_register,
2837 external_to_internal_register_size,
2843 external_to_internal_register_size;
2846 external_to_internal_register
2853 external_to_internal_register
2858 if (COMPILE_STACK_FULL) {
2859 RETALLOC(compile_stack.stack,
2862 compile_stack_elt_t);
2863 if (compile_stack.stack == NULL)
2866 compile_stack.size <<= 1;
2869 /* These are the values to restore when
2870 we hit end of this group. They are
2871 all relative offsets, so that if the
2872 whole pattern moves because of
2873 realloc, they will still be
2875 COMPILE_STACK_TOP.begalt_offset =
2876 begalt - bufp->buffer;
2877 COMPILE_STACK_TOP.fixup_alt_jump =
2882 COMPILE_STACK_TOP.laststart_offset =
2883 buf_end - bufp->buffer;
2884 COMPILE_STACK_TOP.regnum = r;
2886 /* We will eventually replace the 0 with
2887 the number of groups inner to this
2888 one. But do not push a start_memory
2889 for groups beyond the last one we can
2890 represent in the compiled pattern.
2891 #### bad bad bad. this will fail in
2892 lots of ways, if we ever have to
2893 backtrack for these groups.
2895 if (r <= MAX_REGNUM) {
2897 inner_group_offset =
2898 buf_end - bufp->buffer +
2900 BUF_PUSH_3(start_memory, r, 0);
2903 compile_stack.avail++;
2908 /* If we've reached MAX_REGNUM groups,
2909 then this open won't actually
2910 generate any code, so we'll have to
2911 clear pending_exact explicitly. */
2917 if (syntax & RE_NO_BK_PARENS)
2918 goto normal_backslash;
2920 if (COMPILE_STACK_EMPTY) {
2922 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2923 goto normal_backslash;
2931 if (fixup_alt_jump) {
2932 /* Push a dummy failure point at the end
2933 of the alternative for a possible
2934 future `pop_failure_jump' to pop.
2935 See comments at `push_dummy_failure'
2937 BUF_PUSH(push_dummy_failure);
2939 /* We allocated space for this jump when
2940 we assigned to `fixup_alt_jump', in
2941 the `handle_alt' case below. */
2942 STORE_JUMP(jump_past_alt,
2943 fixup_alt_jump, buf_end - 1);
2946 /* See similar code for backslashed left paren
2948 if (COMPILE_STACK_EMPTY) {
2950 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2958 /* Since we just checked for an empty stack
2959 above, this ``can't happen''. */
2960 assert(compile_stack.avail != 0);
2962 /* We don't just want to restore into
2963 `regnum', because later groups should
2964 continue to be numbered higher, as in
2965 `(ab)c(de)' -- the second group is
2967 regnum_t this_group_regnum;
2969 compile_stack.avail--;
2972 COMPILE_STACK_TOP.begalt_offset;
2985 COMPILE_STACK_TOP.regnum;
2986 /* If we've reached MAX_REGNUM groups,
2987 then this open won't actually
2988 generate any code, so we'll have to
2989 clear pending_exact explicitly. */
2992 /* We're at the end of the group, so now
2993 we know how many groups were inside
2995 if (this_group_regnum <= MAX_REGNUM) {
2996 unsigned char *inner_group_loc
3005 BUF_PUSH_3(stop_memory,
3013 case '|': /* `\|'. */
3014 if (syntax & RE_LIMITED_OPS
3015 || syntax & RE_NO_BK_VBAR) {
3016 goto normal_backslash;
3019 if (syntax & RE_LIMITED_OPS) {
3023 /* Insert before the previous alternative a jump
3024 which jumps to this alternative if the former
3026 GET_BUFFER_SPACE(3);
3027 INSERT_JUMP(on_failure_jump, begalt,
3032 /* The alternative before this one has a jump after it
3033 which gets executed if it gets matched. Adjust that
3034 jump so it will jump to this alternative's analogous
3035 jump (put in below, which in turn will jump to the next
3036 (if any) alternative's such jump, etc.). The last such
3037 jump jumps to the correct final destination. A picture:
3043 If we are at `b', then fixup_alt_jump right now points to a
3044 three-byte space after `a'. We'll put in the jump, set
3045 fixup_alt_jump to right after `b', and leave behind three
3046 bytes which we'll fill in when we get to after `c'. */
3049 STORE_JUMP(jump_past_alt,
3050 fixup_alt_jump, buf_end);
3052 /* Mark and leave space for a jump after this alternative,
3053 to be filled in later either by next alternative or
3054 when know we're at the end of a series of alternatives. */
3055 fixup_alt_jump = buf_end;
3056 GET_BUFFER_SPACE(3);
3064 /* If \{ is a literal. */
3065 if (!(syntax & RE_INTERVALS)
3066 /* If we're at `\{' and it's not the open-interval
3068 || ((syntax & RE_INTERVALS)
3069 && (syntax & RE_NO_BK_BRACES))
3070 || (p - 2 == pattern && p == pend))
3071 goto normal_backslash;
3075 /* If got here, then the syntax allows
3078 /* At least (most) this many matches
3080 int lower_bound = -1, upper_bound = -1;
3082 beg_interval = p - 1;
3085 if (syntax & RE_NO_BK_BRACES) {
3086 goto unfetch_interval;
3093 GET_UNSIGNED_NUMBER(lower_bound);
3096 GET_UNSIGNED_NUMBER(
3098 if (upper_bound < 0) {
3103 /* Interval such as `{1}' =>
3104 match exactly once. */
3105 upper_bound = lower_bound;
3109 || upper_bound > RE_DUP_MAX
3110 || lower_bound > upper_bound) {
3111 if (syntax & RE_NO_BK_BRACES) {
3112 goto unfetch_interval;
3119 if (!(syntax & RE_NO_BK_BRACES)) {
3128 if (syntax & RE_NO_BK_BRACES) {
3129 goto unfetch_interval;
3136 /* We just parsed a valid interval. */
3138 /* If it's invalid to have no preceding
3142 RE_CONTEXT_INVALID_OPS) {
3147 RE_CONTEXT_INDEP_OPS) {
3148 laststart = buf_end;
3150 goto unfetch_interval;
3154 /* If the upper bound is zero, don't
3155 want to succeed at all; jump from
3156 `laststart' to `b + 3', which will be
3157 the end of the buffer after we insert
3159 if (upper_bound == 0) {
3160 GET_BUFFER_SPACE(3);
3161 INSERT_JUMP(jump, laststart,
3166 /* Otherwise, we have a nontrivial interval. When
3167 we're all done, the pattern will look like:
3168 set_number_at <jump count> <upper bound>
3169 set_number_at <succeed_n count> <lower bound>
3170 succeed_n <after jump addr> <succeed_n count>
3172 jump_n <succeed_n addr> <jump count>
3173 (The upper bound and `jump_n' are omitted if
3174 `upper_bound' is 1, though.) */
3175 else { /* If the upper bound is > 1, we need to insert
3176 more at the end of the loop. */
3177 Memory_count nbytes =
3178 10 + (upper_bound > 1) * 10;
3180 GET_BUFFER_SPACE(nbytes);
3182 /* Initialize lower bound of the `succeed_n', even
3183 though it will be set during matching by its
3184 attendant `set_number_at' (inserted next),
3185 because `re_compile_fastmap' needs to know.
3186 Jump to the `jump_n' we might insert below. */
3187 INSERT_JUMP2(succeed_n,
3195 /* Code to initialize the lower bound. Insert
3196 before the `succeed_n'. The `5' is the last two
3197 bytes of this `set_number_at', plus 3 bytes of
3198 the following `succeed_n'. */
3199 insert_op2(set_number_at,
3205 if (upper_bound > 1) { /* More than one repetition is allowed, so
3206 append a backward jump to the `succeed_n'
3207 that starts this interval.
3209 When we've reached this during matching,
3210 we'll have matched the interval once, so
3211 jump back only `upper_bound - 1' times. */
3220 /* The location we want to set is the second
3221 parameter of the `jump_n'; that is `b-2' as
3222 an absolute address. `laststart' will be
3223 the `set_number_at' we're about to insert;
3224 `laststart+3' the number to set, the source
3225 for the relative address. But we are
3226 inserting into the middle of the pattern --
3227 so everything is getting moved up by 5.
3228 Conclusion: (b - 2) - (laststart + 3) + 5,
3229 i.e., b - laststart.
3231 We insert this at the beginning of the loop
3232 so that if we fail during matching, we'll
3233 reinitialize the bounds. */
3245 beg_interval = NULL;
3250 /* If an invalid interval, match the characters as literals. */
3251 assert(beg_interval);
3253 beg_interval = NULL;
3255 /* normal_char and normal_backslash need `c'. */
3258 if (!(syntax & RE_NO_BK_BRACES)) {
3259 if (p > pattern && p[-1] == '\\')
3260 goto normal_backslash;
3265 /* There is no way to specify the before_dot and after_dot
3266 operators. rms says this is ok. --karl */
3272 laststart = buf_end;
3274 /* XEmacs addition */
3275 if (c >= 0x80 || syntax_spec_code[c] == 0377) {
3279 BUF_PUSH_2(syntaxspec, syntax_spec_code[c]);
3283 laststart = buf_end;
3285 /* XEmacs addition */
3286 if (c >= 0x80 || syntax_spec_code[c] == 0377) {
3290 BUF_PUSH_2(notsyntaxspec, syntax_spec_code[c]);
3294 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
3296 laststart = buf_end;
3298 if (c < 32 || c > 127) {
3300 return REG_ECATEGORY;
3302 BUF_PUSH_2(categoryspec, c);
3306 laststart = buf_end;
3308 if (c < 32 || c > 127) {
3310 return REG_ECATEGORY;
3312 BUF_PUSH_2(notcategoryspec, c);
3314 /* end of category patch */
3319 laststart = buf_end;
3324 laststart = buf_end;
3325 BUF_PUSH(notwordchar);
3337 BUF_PUSH(wordbound);
3341 BUF_PUSH(notwordbound);
3363 if (syntax & RE_NO_BK_REFS) {
3366 /* External register indexing. */
3369 if (reg > bufp->re_nsub) {
3374 /* Convert external to internal as soon
3376 reg = bufp->external_to_internal_register[reg];
3378 /* Can't back reference to a
3379 subexpression if inside it. */
3380 if (group_in_compile_stack(
3381 compile_stack, reg)) {
3384 laststart = buf_end;
3385 BUF_PUSH_2(duplicate, reg);
3391 if (syntax & RE_BK_PLUS_QM) {
3394 goto normal_backslash;
3398 /* You might think it would be useful for \ to
3399 mean not to translate; but if we don't
3400 translate it, it will never match
3407 /* Expects the character in `c'. */
3408 /* `p' points to the location after where `c' came
3412 /* XEmacs: modifications here for Mule. */
3413 /* `q' points to the beginning of the next char. */
3416 /* If no exactn currently being built. */
3418 /* If last exactn not at current position. */
3419 || pending_exact + *pending_exact + 1 !=
3421 /* We have only one byte following the exactn for
3423 || ((unsigned int)(*pending_exact + (q - p)) >=
3424 ((unsigned int)(1 << BYTEWIDTH) - 1))
3426 /* If followed by a repetition operator. */
3427 || *q == '*' || *q == '^'
3428 || ((syntax & RE_BK_PLUS_QM)
3429 ? *q == '\\' && (q[1] == '+'
3431 : (*q == '+' || *q == '?'))
3432 || ((syntax & RE_INTERVALS)
3433 && ((syntax & RE_NO_BK_BRACES)
3435 : (q[0] == '\\' && q[1] == '{')))) {
3436 /* Start building a new exactn. */
3438 laststart = buf_end;
3440 BUF_PUSH_2(exactn, 0);
3441 pending_exact = buf_end - 1;
3449 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3453 set_charptr_emchar(tmp_buf, c);
3455 for (i = 0; i < bt_count; i++) {
3456 BUF_PUSH(tmp_buf[i]);
3464 } /* while p != pend */
3466 /* Through the pattern now. */
3468 if (fixup_alt_jump) {
3469 STORE_JUMP(jump_past_alt, fixup_alt_jump, buf_end);
3471 if (!COMPILE_STACK_EMPTY) {
3475 /* If we don't want backtracking, force success
3476 the first time we reach the end of the compiled pattern. */
3477 if (syntax & RE_NO_POSIX_BACKTRACKING) {
3480 xfree(compile_stack.stack);
3482 /* We have succeeded; set the length of the buffer. */
3483 bufp->used = buf_end - bufp->buffer;
3485 #ifdef REGEX_DEBUG_FLAG
3487 DEBUG_PRINT1("\nCompiled pattern: \n");
3488 print_compiled_pattern(bufp);
3492 #ifndef MATCH_MAY_ALLOCATE
3493 /* Initialize the failure stack to the largest possible stack. This
3494 isn't necessary unless we're trying to avoid calling alloca in
3495 the search and match routines. */
3497 int num_regs = bufp->re_ngroups + 1;
3499 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3500 is strictly greater than re_max_failures, the largest possible stack
3501 is 2 * re_max_failures failure points. */
3502 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) {
3504 (2 * re_max_failures * MAX_FAILURE_ITEMS);
3506 if (!fail_stack.stack) {
3508 (fail_stack_elt_t *)
3511 sizeof(fail_stack_elt_t));
3514 (fail_stack_elt_t *)
3515 xrealloc(fail_stack.stack,
3517 sizeof(fail_stack_elt_t)));
3521 regex_grow_registers(num_regs);
3523 #endif /* not MATCH_MAY_ALLOCATE */
3529 /* Subroutines for `regex_compile'. */
3531 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3533 static void store_op1(re_opcode_t op, unsigned char *loc, int arg)
3535 *loc = (unsigned char)op;
3536 STORE_NUMBER(loc + 1, arg);
3539 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3541 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3543 *loc = (unsigned char)op;
3544 STORE_NUMBER(loc + 1, arg1);
3545 STORE_NUMBER(loc + 3, arg2);
3548 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3549 for OP followed by two-byte integer parameter ARG. */
3552 insert_op1(re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3554 REGISTER unsigned char *pfrom = end;
3555 REGISTER unsigned char *pto = end + 3;
3557 while (pfrom != loc)
3560 store_op1(op, loc, arg);
3563 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3566 insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3569 REGISTER unsigned char *pfrom = end;
3570 REGISTER unsigned char *pto = end + 5;
3572 while (pfrom != loc)
3575 store_op2(op, loc, arg1, arg2);
3578 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3579 after an alternative or a begin-subexpression. We assume there is at
3580 least one character before the ^. */
3583 at_begline_loc_p(re_char *pattern, re_char *p, reg_syntax_t syntax)
3585 re_char *prev = p - 2;
3586 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3589 /* After a subexpression? */
3590 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3591 /* After an alternative? */
3593 && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3596 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3597 at least one character after the $, i.e., `P < PEND'. */
3600 at_endline_loc_p(re_char *p, re_char *pend, int syntax)
3603 re_bool next_backslash = *next == '\\';
3604 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3607 /* Before a subexpression? */
3608 (syntax & RE_NO_BK_PARENS ? *next == ')'
3609 : next_backslash && next_next && *next_next == ')')
3610 /* Before an alternative? */
3611 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3612 : next_backslash && next_next && *next_next == '|');
3615 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3616 false if it's not. */
3619 group_in_compile_stack(compile_stack_type compile_stack, regnum_t regnum)
3623 for (this_element = compile_stack.avail - 1;
3624 this_element >= 0; this_element--)
3625 if (compile_stack.stack[this_element].regnum == regnum) {
3631 /* Read the ending character of a range (in a bracket expression) from the
3632 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3633 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3634 Then we set the translation of all bits between the starting and
3635 ending characters (inclusive) in the compiled pattern B.
3637 Return an error code.
3639 We use these short variable names so we can use the same macros as
3640 `regex_compile' itself. */
3642 static reg_errcode_t
3643 compile_range(re_char ** p_ptr, re_char * pend, RE_TRANSLATE_TYPE translate,
3644 reg_syntax_t syntax, unsigned char *buf_end)
3646 Element_count this_char;
3648 re_char *p = *p_ptr;
3649 int range_start, range_end;
3654 /* Even though the pattern is a signed `char *', we need to fetch
3655 with unsigned char *'s; if the high bit of the pattern character
3656 is set, the range endpoints will be negative if we fetch using a
3659 We also want to fetch the endpoints without translating them; the
3660 appropriate translation is done in the bit-setting loop below. */
3661 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3662 range_start = ((const unsigned char *)p)[-2];
3663 range_end = ((const unsigned char *)p)[0];
3665 /* Have to increment the pointer into the pattern string, so the
3666 caller isn't still at the ending character. */
3669 /* If the start is after the end, the range is empty. */
3670 if (range_start > range_end)
3671 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3673 /* Here we see why `this_char' has to be larger than an `unsigned
3674 char' -- the range is inclusive, so if `range_end' == 0xff
3675 (assuming 8-bit characters), we would otherwise go into an infinite
3676 loop, since all characters <= 0xff. */
3677 for (this_char = range_start; this_char <= range_end; this_char++) {
3678 SET_LIST_BIT(TRANSLATE(this_char));
3686 static reg_errcode_t
3687 compile_extended_range(re_char ** p_ptr, re_char * pend,
3688 RE_TRANSLATE_TYPE translate,
3689 reg_syntax_t syntax, Lisp_Object rtab)
3691 Emchar this_char, range_start, range_end;
3697 p = (const Bufbyte *)*p_ptr;
3698 range_end = charptr_emchar(p);
3699 p--; /* back to '-' */
3700 DEC_CHARPTR(p); /* back to start of range */
3701 /* We also want to fetch the endpoints without translating them; the
3702 appropriate translation is done in the bit-setting loop below. */
3703 range_start = charptr_emchar(p);
3704 INC_CHARPTR(*p_ptr);
3706 /* If the start is after the end, the range is empty. */
3707 if (range_start > range_end)
3708 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3710 /* Can't have ranges spanning different charsets, except maybe for
3711 ranges entirely within the first 256 chars. */
3713 if ((range_start >= 0x100 || range_end >= 0x100)
3714 && CHAR_LEADING_BYTE(range_start) != CHAR_LEADING_BYTE(range_end))
3715 return REG_ERANGESPAN;
3717 /* As advertised, translations only work over the 0 - 0x7F range.
3718 Making this kind of stuff work generally is much harder.
3719 Iterating over the whole range like this would be way efficient
3720 if the range encompasses 10,000 chars or something. You'd have
3721 to do something like this:
3725 map over translation table in [range_start, range_end] of
3726 (put the mapped range in a;
3727 put the translation in b)
3728 invert the range in a and truncate to [range_start, range_end]
3729 compute the union of a, b
3730 union the result into rtab
3732 for (this_char = range_start;
3733 this_char <= range_end && this_char < 0x80; this_char++) {
3734 SET_RANGETAB_BIT(TRANSLATE(this_char));
3737 if (this_char <= range_end)
3738 put_range_table(rtab, this_char, range_end, Qt);
3745 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3746 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3747 characters can start a string that matches the pattern. This fastmap
3748 is used by re_search to skip quickly over impossible starting points.
3750 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3751 area as BUFP->fastmap.
3753 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3756 Returns 0 if we succeed, -2 if an internal error. */
3758 int re_compile_fastmap(struct re_pattern_buffer *bufp)
3761 #ifdef MATCH_MAY_ALLOCATE
3762 fail_stack_type fail_stack;
3764 DECLARE_DESTINATION;
3765 /* We don't push any register information onto the failure stack. */
3767 REGISTER char *fastmap = bufp->fastmap;
3768 unsigned char *pattern = bufp->buffer;
3769 unsigned long size = bufp->used;
3770 /* actually p supposed to carry the const qualifier, however, some
3771 silly mule code below CHANGES p and hence we cant go with the const
3772 qualifier here, leaving an `unfixable' warning on the way */
3774 unsigned char *p = pattern;
3776 re_char *p = pattern;
3778 REGISTER unsigned char *pend = pattern + size;
3781 /* This holds the pointer to the failure stack, when
3782 it is allocated relocatably. */
3783 fail_stack_elt_t *failure_stack_ptr;
3786 /* Assume that each path through the pattern can be null until
3787 proven otherwise. We set this false at the bottom of switch
3788 statement, to which we get only if a particular path doesn't
3789 match the empty string. */
3790 re_bool path_can_be_null = true;
3792 /* We aren't doing a `succeed_n' to begin with. */
3793 re_bool succeed_n_p = false;
3795 assert(fastmap != NULL && p != NULL);
3798 memset(fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3799 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3800 bufp->can_be_null = 0;
3803 if (p == pend || *p == succeed) {
3804 /* We have reached the (effective) end of pattern. */
3805 if (!FAIL_STACK_EMPTY()) {
3806 bufp->can_be_null |= path_can_be_null;
3808 /* Reset for next path. */
3809 path_can_be_null = true;
3811 /* fuck, p isnt const thanks to that
3812 * unified range table function below */
3814 #pragma GCC diagnostic push
3815 #pragma GCC diagnostic ignored "-Wcast-qual"
3816 p = (unsigned char*)fail_stack.
3817 stack[--fail_stack.avail].pointer;
3818 #pragma GCC diagnostic pop
3820 p = fail_stack.stack[--fail_stack.avail]
3830 /* We should never be about to go beyond the end of the pattern. */
3833 switch (SWITCH_ENUM_CAST((re_opcode_t) * p++)) {
3835 /* I guess the idea here is to simply not bother with a
3836 fastmap if a backreference is used, since it's too
3837 hard to figure out the fastmap for the corresponding
3838 group. Setting `can_be_null' stops `re_search_2'
3839 from using the fastmap, so that is all we do. */
3841 bufp->can_be_null = 1;
3844 /* Following are the cases which match a character.
3845 These end with `break'. */
3852 /* XEmacs: Under Mule, these bit vectors will
3853 only contain values for characters below 0x80. */
3854 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3855 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3860 /* Chars beyond end of map must be allowed. */
3862 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3864 /* And all extended characters must be allowed, too. */
3865 for (j = 0x80; j < 0xA0; j++)
3867 #else /* not MULE */
3868 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3872 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3874 (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3879 case charset_mule: {
3883 nentries = unified_range_table_nentries(p);
3884 for (i = 0; i < nentries; i++) {
3885 EMACS_INT first, last;
3886 Lisp_Object dummy_val;
3888 Bufbyte strr[MAX_EMCHAR_LEN];
3890 unified_range_table_get_range(p, i,
3895 jj <= last && jj < 0x80; jj++)
3897 /* Ranges below 0x100 can span charsets, but there
3898 are only two (Control-1 and Latin-1), and
3899 either first or last has to be in them. */
3900 set_charptr_emchar(strr, first);
3903 set_charptr_emchar(strr, last);
3910 case charset_mule_not: {
3914 nentries = unified_range_table_nentries(p);
3915 for (i = 0; i < nentries; i++) {
3916 EMACS_INT first, last;
3917 Lisp_Object dummy_val;
3919 int smallest_prev = 0;
3921 unified_range_table_get_range(p, i,
3925 for (jj = smallest_prev;
3926 jj < first && jj < 0x80; jj++)
3928 smallest_prev = last + 1;
3929 if (smallest_prev >= 0x80)
3932 /* Calculating which leading bytes are actually allowed
3933 here is rather difficult, so we just punt and allow
3935 for (i = 0x80; i < 0xA0; i++)
3946 for (j = 0; j < (1 << BYTEWIDTH); j++)
3949 (regex_emacs_buffer->mirror_syntax_table),
3958 goto matchnotsyntax;
3960 for (j = 0; j < (1 << BYTEWIDTH); j++)
3963 (regex_emacs_buffer->mirror_syntax_table),
3970 int fastmap_newline = fastmap['\n'];
3972 /* `.' matches anything ... */
3974 /* "anything" only includes bytes that can be the
3975 first byte of a character. */
3976 for (j = 0; j < 0xA0; j++)
3979 for (j = 0; j < (1 << BYTEWIDTH); j++)
3983 /* ... except perhaps newline. */
3984 if (!(bufp->syntax & RE_DOT_NEWLINE))
3985 fastmap['\n'] = fastmap_newline;
3987 /* Return if we have already set `can_be_null'; if we
3988 have, then the fastmap is irrelevant. Something's
3990 else if (bufp->can_be_null)
3993 /* Otherwise, have to check alternative paths. */
4004 /* This match depends on text properties. These end with
4005 aborting optimizations. */
4006 bufp->can_be_null = 1;
4012 for (j = 0; j < 0x80; j++)
4015 (regex_emacs_buffer->mirror_syntax_table),
4016 j) == (enum syntaxcode)k)
4018 for (j = 0x80; j < 0xA0; j++) {
4019 if (LEADING_BYTE_PREFIX_P(j))
4020 /* too complicated to calculate this right */
4026 cset = CHARSET_BY_LEADING_BYTE(j);
4027 if (CHARSETP(cset)) {
4029 (regex_emacs_buffer, cset,
4031 == Sword || multi_p)
4036 #else /* not MULE */
4037 for (j = 0; j < (1 << BYTEWIDTH); j++)
4040 (regex_emacs_buffer->mirror_syntax_table),
4041 j) == (enum syntaxcode)k)
4048 for (j = 0; j < 0x80; j++)
4051 (regex_emacs_buffer->mirror_syntax_table),
4052 j) != (enum syntaxcode)k)
4054 for (j = 0x80; j < 0xA0; j++) {
4055 if (LEADING_BYTE_PREFIX_P(j))
4056 /* too complicated to calculate this right */
4062 cset = CHARSET_BY_LEADING_BYTE(j);
4063 if (CHARSETP(cset)) {
4065 (regex_emacs_buffer, cset,
4067 != Sword || multi_p)
4072 #else /* not MULE */
4073 for (j = 0; j < (1 << BYTEWIDTH); j++)
4076 (regex_emacs_buffer->mirror_syntax_table),
4077 j) != (enum syntaxcode)k)
4084 /* 97/2/17 jhod category patch */
4086 case notcategoryspec:
4087 bufp->can_be_null = 1;
4089 /* end if category patch */
4092 /* All cases after this match the empty string. These
4093 end with `continue'. */
4112 case push_dummy_failure:
4116 case pop_failure_jump:
4117 case maybe_pop_jump:
4120 case dummy_failure_jump:
4121 EXTRACT_NUMBER_AND_INCR(j, p);
4126 /* Jump backward implies we just went through the body
4127 of a loop and matched nothing. Opcode jumped to
4128 should be `on_failure_jump' or `succeed_n'. Just
4129 treat it like an ordinary jump. For a * loop, it has
4130 pushed its failure point already; if so, discard that
4132 if ((re_opcode_t) * p != on_failure_jump
4133 && (re_opcode_t) * p != succeed_n)
4137 EXTRACT_NUMBER_AND_INCR(j, p);
4140 /* If what's on the stack is where we are now, pop
4142 if (!FAIL_STACK_EMPTY()
4143 && fail_stack.stack[fail_stack.avail - 1].pointer ==
4149 case on_failure_jump:
4150 case on_failure_keep_string_jump:
4151 handle_on_failure_jump:
4152 EXTRACT_NUMBER_AND_INCR(j, p);
4154 /* For some patterns, e.g., `(a?)?', `p+j' here points
4155 to the end of the pattern. We don't want to push
4156 such a point, since when we restore it above,
4157 entering the switch will increment `p' past the end
4158 of the pattern. We don't need to push such a point
4159 since we obviously won't find any more fastmap
4160 entries beyond `pend'. Such a pattern can match the
4161 null string, though. */
4163 if (!PUSH_PATTERN_OP(p + j, fail_stack)) {
4168 bufp->can_be_null = 1;
4171 EXTRACT_NUMBER_AND_INCR(k, p); /* Skip the n. */
4172 succeed_n_p = false;
4178 /* Get to the number of times to succeed. */
4181 /* Increment p past the n for when k != 0. */
4182 EXTRACT_NUMBER_AND_INCR(k, p);
4185 succeed_n_p = true; /* Spaghetti code alert. */
4186 goto handle_on_failure_jump;
4201 abort(); /* We have listed all the cases. */
4204 /* Getting here means we have found the possible starting
4205 characters for one path of the pattern -- and that the empty
4206 string does not match. We need not follow this path further.
4207 Instead, look at the next alternative (remembered on the
4208 stack), or quit if no more. The test at the top of the loop
4209 does these things. */
4210 path_can_be_null = false;
4214 /* Set `can_be_null' for the last path (also the first path, if the
4215 pattern is empty). */
4216 bufp->can_be_null |= path_can_be_null;
4221 } /* re_compile_fastmap */
4223 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4224 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4225 this memory for recording register information. STARTS and ENDS
4226 must be allocated using the malloc library routine, and must each
4227 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4229 If NUM_REGS == 0, then subsequent matches should allocate their own
4232 Unless this function is called, the first search or match using
4233 PATTERN_BUFFER will allocate its own register data, without
4234 freeing the old data. */
4237 re_set_registers(struct re_pattern_buffer *bufp, struct re_registers *regs,
4238 unsigned num_regs, regoff_t * starts, regoff_t * ends)
4241 bufp->regs_allocated = REGS_REALLOCATE;
4242 regs->num_regs = num_regs;
4243 regs->start = starts;
4246 bufp->regs_allocated = REGS_UNALLOCATED;
4248 regs->start = regs->end = (regoff_t *) 0;
4252 /* Searching routines. */
4254 /* Like re_search_2, below, but only one string is specified, and
4255 doesn't let you say where to stop matching. */
4258 re_search(struct re_pattern_buffer *bufp, const char *string, int size,
4259 int startpos, int range, struct re_registers *regs)
4261 return re_search_2(bufp, NULL, 0, string, size, startpos, range,
4266 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
4267 # define bytecount_to_charcount(ptr, len) (len)
4268 # define charcount_to_bytecount(ptr, len) (len)
4269 typedef int Charcount;
4272 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4273 virtual concatenation of STRING1 and STRING2, starting first at index
4274 STARTPOS, then at STARTPOS + 1, and so on.
4276 With MULE, STARTPOS is a byte position, not a char position. And the
4277 search will increment STARTPOS by the width of the current leading
4280 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4282 RANGE is how far to scan while trying to match. RANGE = 0 means try
4283 only at STARTPOS; in general, the last start tried is STARTPOS +
4286 With MULE, RANGE is a byte position, not a char position. The last
4287 start tried is the character starting <= STARTPOS + RANGE.
4289 In REGS, return the indices of the virtual concatenation of STRING1
4290 and STRING2 that matched the entire BUFP->buffer and its contained
4293 Do not consider matching one past the index STOP in the virtual
4294 concatenation of STRING1 and STRING2.
4296 We return either the position in the strings at which the match was
4297 found, -1 if no match, or -2 if error (such as failure
4301 re_search_2(struct re_pattern_buffer *bufp, const char *str1,
4302 int size1, const char *str2, int size2, int startpos,
4303 int range, struct re_registers *regs, int stop)
4306 re_char *string1 = (re_char *) str1;
4307 re_char *string2 = (re_char *) str2;
4308 REGISTER char *fastmap = bufp->fastmap;
4309 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
4310 int total_size = size1 + size2;
4311 int endpos = startpos + range;
4312 #ifdef REGEX_BEGLINE_CHECK
4313 int anchored_at_begline = 0;
4318 /* Check for out-of-range STARTPOS. */
4319 if (startpos < 0 || startpos > total_size)
4322 /* Fix up RANGE if it might eventually take us outside
4323 the virtual concatenation of STRING1 and STRING2. */
4325 range = 0 - startpos;
4326 else if (endpos > total_size)
4327 range = total_size - startpos;
4329 /* If the search isn't to be a backwards one, don't waste time in a
4330 search for a pattern that must be anchored. */
4331 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf
4336 d = ((const unsigned char *)
4338 size1 ? string2 - size1 : string1) + startpos);
4339 range = charcount_to_bytecount(d, 1);
4343 /* In a forward search for something that starts with \=.
4344 don't keep searching past point. */
4345 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot
4348 BUF_PT(regex_emacs_buffer) - BUF_BEGV(regex_emacs_buffer)
4355 /* Update the fastmap now if not correct already. */
4356 if (fastmap && !bufp->fastmap_accurate)
4357 if (re_compile_fastmap(bufp) == -2)
4360 #ifdef REGEX_BEGLINE_CHECK
4362 unsigned long i = 0;
4364 while (i < bufp->used) {
4365 if (bufp->buffer[i] == start_memory ||
4366 bufp->buffer[i] == stop_memory)
4371 anchored_at_begline = i < bufp->used
4372 && bufp->buffer[i] == begline;
4377 SETUP_SYNTAX_CACHE_FOR_OBJECT(regex_match_object,
4379 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR
4380 (regex_match_object, regex_emacs_buffer,
4384 /* Loop through the string, looking for a place to start matching. */
4386 #ifdef REGEX_BEGLINE_CHECK
4387 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4388 then we can speed things up by skipping to the next beginning-of-
4390 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4392 /* whose stupid idea was it anyway to make this
4393 function take two strings to match?? */
4397 if (startpos < size1 && startpos + range >= size1)
4398 lim = range - (size1 - startpos);
4400 d = ((const unsigned char *)
4402 size1 ? string2 - size1 : string1) + startpos);
4403 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4404 d_size = charcount_to_bytecount(d, 1);
4406 if (TRANSLATE_P(translate))
4407 while (range > lim && *d != '\n') {
4408 d += d_size; /* Speedier INC_CHARPTR(d) */
4409 d_size = charcount_to_bytecount(d, 1);
4412 while (range > lim && *d != '\n') {
4413 d += d_size; /* Speedier INC_CHARPTR(d) */
4414 d_size = charcount_to_bytecount(d, 1);
4418 startpos += irange - range;
4420 #endif /* REGEX_BEGLINE_CHECK */
4422 /* If a fastmap is supplied, skip quickly over characters that
4423 cannot be the start of a match. If the pattern can match the
4424 null string, however, we don't need to skip characters; we want
4425 the first null string. */
4426 if (fastmap && startpos < total_size && !bufp->can_be_null) {
4427 if (range > 0) { /* Searching forwards. */
4431 if (startpos < size1
4432 && startpos + range >= size1)
4433 lim = range - (size1 - startpos);
4435 d = ((const unsigned char *)
4437 size1 ? string2 - size1 : string1) +
4440 /* Written out as an if-else to avoid testing `translate'
4442 if (TRANSLATE_P(translate))
4443 while (range > lim) {
4446 Bufbyte str[MAX_EMCHAR_LEN];
4448 buf_ch = charptr_emchar(d);
4449 buf_ch = RE_TRANSLATE(buf_ch);
4450 set_charptr_emchar(str, buf_ch);
4452 || fastmap[(unsigned char)
4462 charcount_to_bytecount(d,
4465 d += d_size; /* Speedier INC_CHARPTR(d) */
4467 while (range > lim && !fastmap[*d]) {
4469 charcount_to_bytecount(d,
4472 d += d_size; /* Speedier INC_CHARPTR(d) */
4475 startpos += irange - range;
4476 } else { /* Searching backwards. */
4478 Emchar c = (size1 == 0 || startpos >= size1
4479 ? charptr_emchar(string2 +
4481 : charptr_emchar(string1 +
4485 if (!(c >= 0200 || fastmap[(unsigned char)c]))
4488 if (!fastmap[(unsigned char)c])
4494 /* If can't match the null string, and that's all we have left, fail. */
4495 if (range >= 0 && startpos == total_size && fastmap
4496 && !bufp->can_be_null)
4499 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4500 if (!no_quit_in_re_search)
4503 val = re_match_2_internal(bufp, string1, size1, string2, size2,
4504 startpos, regs, stop);
4505 #ifndef REGEX_MALLOC
4520 else if (range > 0) {
4521 d = ((const unsigned char *)
4523 size1 ? string2 - size1 : string1) + startpos);
4524 d_size = charcount_to_bytecount(d, 1);
4528 /* Note startpos > size1 not >=. If we are on the
4529 string1/string2 boundary, we want to backup into string1. */
4530 d = ((const unsigned char *)
4532 size1 ? string2 - size1 : string1) + startpos);
4534 d_size = charcount_to_bytecount(d, 1);
4542 /* Declarations and macros for re_match_2. */
4544 /* This converts PTR, a pointer into one of the search strings `string1'
4545 and `string2' into an offset from the beginning of that string. */
4546 #define POINTER_TO_OFFSET(ptr) \
4547 (FIRST_STRING_P (ptr) \
4548 ? ((regoff_t) ((ptr) - string1)) \
4549 : ((regoff_t) ((ptr) - string2 + size1)))
4551 /* Macros for dealing with the split strings in re_match_2. */
4553 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4555 /* Call before fetching a character with *d. This switches over to
4556 string2 if necessary. */
4557 #define REGEX_PREFETCH() \
4560 /* End of string2 => fail. */ \
4561 if (dend == end_match_2) \
4563 /* End of string1 => advance to string2. */ \
4565 dend = end_match_2; \
4568 /* Test if at very beginning or at very end of the virtual concatenation
4569 of `string1' and `string2'. If only one string, it's `string2'. */
4570 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4571 #define AT_STRINGS_END(d) ((d) == end2)
4574 If the given position straddles the string gap, return the equivalent
4575 position that is before or after the gap, respectively; otherwise,
4576 return the same position. */
4577 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4578 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4580 /* Test if CH is a word-constituent character. (XEmacs change) */
4581 #define WORDCHAR_P_UNSAFE(ch) \
4582 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4585 /* Free everything we malloc. */
4586 #ifdef MATCH_MAY_ALLOCATE
4587 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4588 #define FREE_VARIABLES() \
4590 REGEX_FREE_STACK (fail_stack.stack); \
4591 FREE_VAR (regstart); \
4592 FREE_VAR (regend); \
4593 FREE_VAR (old_regstart); \
4594 FREE_VAR (old_regend); \
4595 FREE_VAR (best_regstart); \
4596 FREE_VAR (best_regend); \
4597 FREE_VAR (reg_info); \
4598 FREE_VAR (reg_dummy); \
4599 FREE_VAR (reg_info_dummy); \
4601 #else /* not MATCH_MAY_ALLOCATE */
4602 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4603 #endif /* MATCH_MAY_ALLOCATE */
4605 /* These values must meet several constraints. They must not be valid
4606 register values; since we have a limit of 255 registers (because
4607 we use only one byte in the pattern for the register number), we can
4608 use numbers larger than 255. They must differ by 1, because of
4609 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4610 be larger than the value for the highest register, so we do not try
4611 to actually save any registers when none are active. */
4612 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4613 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4615 /* Matching routines. */
4617 #ifndef emacs /* Emacs never uses this. */
4618 /* re_match is like re_match_2 except it takes only a single string. */
4621 re_match(struct re_pattern_buffer *bufp, const char *string, int size,
4622 int pos, struct re_registers *regs)
4625 re_match_2_internal(bufp, NULL, 0, (re_char *) string, size,
4630 #endif /* not emacs */
4632 /* re_match_2 matches the compiled pattern in BUFP against the
4633 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4634 SIZE2, respectively). We start matching at POS, and stop matching
4637 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4638 store offsets for the substring each group matched in REGS. See the
4639 documentation for exactly how many groups we fill.
4641 We return -1 if no match, -2 if an internal error (such as the
4642 failure stack overflowing). Otherwise, we return the length of the
4643 matched substring. */
4646 re_match_2(struct re_pattern_buffer *bufp, const char *string1,
4647 int size1, const char *string2, int size2, int pos,
4648 struct re_registers *regs, int stop)
4653 SETUP_SYNTAX_CACHE_FOR_OBJECT(regex_match_object,
4655 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR
4656 (regex_match_object, regex_emacs_buffer,
4660 result = re_match_2_internal(bufp, (re_char *) string1, size1,
4661 (re_char *) string2, size2,
4668 /* This is a separate function so that we can force an alloca cleanup
4671 re_match_2_internal(struct re_pattern_buffer *bufp, re_char * string1,
4672 int size1, re_char * string2, int size2, int pos,
4673 struct re_registers *regs, int stop)
4675 /* General temporaries. */
4678 int should_succeed; /* XEmacs change */
4680 /* Just past the end of the corresponding string. */
4681 re_char *end1, *end2;
4683 /* Pointers into string1 and string2, just past the last characters in
4684 each to consider matching. */
4685 re_char *end_match_1, *end_match_2;
4687 /* Where we are in the data, and the end of the current string. */
4688 const re_char *d, *dend;
4690 /* Where we are in the pattern, and the end of the pattern. */
4691 unsigned char *p = bufp->buffer;
4692 REGISTER unsigned char *pend = p + bufp->used;
4694 /* Mark the opcode just after a start_memory, so we can test for an
4695 empty subpattern when we get to the stop_memory. */
4696 re_char *just_past_start_mem = 0;
4698 /* We use this to map every character in the string. */
4699 RE_TRANSLATE_TYPE translate = bufp->translate;
4701 /* Failure point stack. Each place that can handle a failure further
4702 down the line pushes a failure point on this stack. It consists of
4703 restart, regend, and reg_info for all registers corresponding to
4704 the subexpressions we're currently inside, plus the number of such
4705 registers, and, finally, two char *'s. The first char * is where
4706 to resume scanning the pattern; the second one is where to resume
4707 scanning the strings. If the latter is zero, the failure point is
4708 a ``dummy''; if a failure happens and the failure point is a dummy,
4709 it gets discarded and the next one is tried. */
4710 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4711 fail_stack_type fail_stack;
4713 #ifdef REGEX_DEBUG_FLAG
4714 static unsigned failure_id;
4715 int nfailure_points_pushed = 0, nfailure_points_popped = 0;
4719 /* This holds the pointer to the failure stack, when
4720 it is allocated relocatably. */
4721 fail_stack_elt_t *failure_stack_ptr;
4724 /* We fill all the registers internally, independent of what we
4725 return, for use in backreferences. The number here includes
4726 an element for register zero. */
4727 int num_regs = bufp->re_ngroups + 1;
4729 /* The currently active registers. */
4730 int lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4731 int highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4733 /* Information on the contents of registers. These are pointers into
4734 the input strings; they record just what was matched (on this
4735 attempt) by a subexpression part of the pattern, that is, the
4736 regnum-th regstart pointer points to where in the pattern we began
4737 matching and the regnum-th regend points to right after where we
4738 stopped matching the regnum-th subexpression. (The zeroth register
4739 keeps track of what the whole pattern matches.) */
4740 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4741 re_char **regstart, **regend;
4744 /* If a group that's operated upon by a repetition operator fails to
4745 match anything, then the register for its start will need to be
4746 restored because it will have been set to wherever in the string we
4747 are when we last see its open-group operator. Similarly for a
4749 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4750 re_char **old_regstart, **old_regend;
4753 /* The is_active field of reg_info helps us keep track of which (possibly
4754 nested) subexpressions we are currently in. The matched_something
4755 field of reg_info[reg_num] helps us tell whether or not we have
4756 matched any of the pattern so far this time through the reg_num-th
4757 subexpression. These two fields get reset each time through any
4758 loop their register is in. */
4759 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4760 register_info_type *reg_info;
4763 /* The following record the register info as found in the above
4764 variables when we find a match better than any we've seen before.
4765 This happens as we backtrack through the failure points, which in
4766 turn happens only if we have not yet matched the entire string. */
4767 unsigned best_regs_set = false;
4768 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4769 re_char **best_regstart, **best_regend;
4772 /* Logically, this is `best_regend[0]'. But we don't want to have to
4773 allocate space for that if we're not allocating space for anything
4774 else (see below). Also, we never need info about register 0 for
4775 any of the other register vectors, and it seems rather a kludge to
4776 treat `best_regend' differently than the rest. So we keep track of
4777 the end of the best match so far in a separate variable. We
4778 initialize this to NULL so that when we backtrack the first time
4779 and need to test it, it's not garbage. */
4780 re_char *match_end = NULL;
4782 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4783 int set_regs_matched_done = 0;
4785 /* Used when we pop values we don't care about. */
4786 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4787 re_char **reg_dummy;
4788 register_info_type *reg_info_dummy;
4791 #ifdef REGEX_DEBUG_FLAG
4792 /* Counts the total number of registers pushed. */
4793 unsigned num_regs_pushed = 0;
4796 /* 1 if this match ends in the same string (string1 or string2)
4797 as the best previous match. */
4800 /* 1 if this match is the best seen so far. */
4801 re_bool best_match_p;
4803 DEBUG_PRINT1("\n\nEntering re_match_2.\n");
4807 #ifdef MATCH_MAY_ALLOCATE
4808 /* Do not bother to initialize all the register variables if there are
4809 no groups in the pattern, as it takes a fair amount of time. If
4810 there are groups, we include space for register 0 (the whole
4811 pattern), even though we never use it, since it simplifies the
4812 array indexing. We should fix this. */
4813 if (bufp->re_ngroups) {
4814 regstart = REGEX_TALLOC(num_regs, re_char *);
4815 regend = REGEX_TALLOC(num_regs, re_char *);
4816 old_regstart = REGEX_TALLOC(num_regs, re_char *);
4817 old_regend = REGEX_TALLOC(num_regs, re_char *);
4818 best_regstart = REGEX_TALLOC(num_regs, re_char *);
4819 best_regend = REGEX_TALLOC(num_regs, re_char *);
4820 reg_info = REGEX_TALLOC(num_regs, register_info_type);
4821 reg_dummy = REGEX_TALLOC(num_regs, re_char *);
4822 reg_info_dummy = REGEX_TALLOC(num_regs, register_info_type);
4825 (regstart && regend && old_regstart && old_regend
4826 && reg_info && best_regstart && best_regend && reg_dummy
4827 && reg_info_dummy)) {
4832 /* We must initialize all our variables to NULL, so that
4833 `FREE_VARIABLES' doesn't try to free them. */
4834 regstart = regend = old_regstart = old_regend = best_regstart
4835 = best_regend = reg_dummy = NULL;
4836 reg_info = reg_info_dummy = (register_info_type *) NULL;
4838 #endif /* MATCH_MAY_ALLOCATE */
4840 /* The starting position is bogus. */
4841 if (pos < 0 || pos > size1 + size2) {
4846 /* Initialize subexpression text positions to -1 to mark ones that no
4847 start_memory/stop_memory has been seen for. Also initialize the
4848 register information struct. */
4849 for (mcnt = 1; mcnt < num_regs; mcnt++) {
4850 regstart[mcnt] = regend[mcnt]
4851 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4853 REG_MATCH_NULL_STRING_P(reg_info[mcnt]) =
4854 MATCH_NULL_UNSET_VALUE;
4855 IS_ACTIVE(reg_info[mcnt]) = 0;
4856 MATCHED_SOMETHING(reg_info[mcnt]) = 0;
4857 EVER_MATCHED_SOMETHING(reg_info[mcnt]) = 0;
4859 /* We move `string1' into `string2' if the latter's empty -- but not if
4860 `string1' is null. */
4861 if (size2 == 0 && string1 != NULL) {
4867 end1 = string1 + size1;
4868 end2 = string2 + size2;
4870 /* Compute where to stop matching, within the two strings. */
4871 if (stop <= size1) {
4872 end_match_1 = string1 + stop;
4873 end_match_2 = string2;
4876 end_match_2 = string2 + stop - size1;
4879 /* `p' scans through the pattern as `d' scans through the data.
4880 `dend' is the end of the input string that `d' points within. `d'
4881 is advanced into the following input string whenever necessary, but
4882 this happens before fetching; therefore, at the beginning of the
4883 loop, `d' can be pointing at the end of a string, but it cannot
4885 if (size1 > 0 && pos <= size1) {
4889 d = string2 + pos - size1;
4893 DEBUG_PRINT1("The compiled pattern is: \n");
4894 DEBUG_PRINT_COMPILED_PATTERN(bufp, p, pend);
4895 DEBUG_PRINT1("The string to match is: `");
4896 DEBUG_PRINT_DOUBLE_STRING(d, string1, size1, string2, size2);
4897 DEBUG_PRINT1("'\n");
4899 /* This loops over pattern commands. It exits by returning from the
4900 function if the match is complete, or it drops through if the match
4901 fails at this starting point in the input data. */
4903 DEBUG_PRINT2("\n0x%lx: ", (long)p);
4904 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4905 if (!no_quit_in_re_search)
4910 /* End of pattern means we might have succeeded. */
4911 DEBUG_PRINT1("end of pattern ... ");
4913 /* If we haven't matched the entire string, and we want
4914 the longest match, try backtracking. */
4915 if (d != end_match_2) {
4916 same_str_p = (FIRST_STRING_P(match_end)
4917 == MATCHING_IN_FIRST_STRING);
4919 /* AIX compiler got confused when this was
4920 combined with the previous declaration. */
4922 best_match_p = d > match_end;
4925 !MATCHING_IN_FIRST_STRING;
4927 DEBUG_PRINT1("backtracking.\n");
4929 if (!FAIL_STACK_EMPTY()) {
4930 /* More failure points to try. */
4932 /* If exceeds best match so far, save
4934 if (!best_regs_set || best_match_p) {
4935 best_regs_set = true;
4939 ("\nSAVING match as best so far.\n");
4941 for (mcnt = 1; mcnt < num_regs;
4943 best_regstart[mcnt] =
4952 /* If no failure points, don't restore garbage.
4953 And if last match is real best match, don't
4954 restore second best one. */
4955 else if (best_regs_set && !best_match_p) {
4957 /* Restore best match. It may happen
4958 that `dend == end_match_1' while the
4959 restored d is in string2. For
4960 example, the pattern `x.*y.*z'
4961 against the strings `x-' and `y-z-',
4962 if the two strings are not
4963 consecutive in memory. */
4965 ("Restoring best registers.\n");
4968 dend = ((d >= string1 && d <= end1)
4969 ? end_match_1 : end_match_2);
4971 for (mcnt = 1; mcnt < num_regs; mcnt++) {
4973 best_regstart[mcnt];
4979 /* d != end_match_2 */
4981 DEBUG_PRINT1("Accepting match.\n");
4983 int num_nonshy_regs = bufp->re_nsub + 1;
4984 /* If caller wants register contents data back,
4986 if (regs && !bufp->no_sub) {
4987 /* Have the register data arrays been
4989 if (bufp->regs_allocated == REGS_UNALLOCATED) {
4990 /* No. So allocate them with malloc.
4991 We need one extra element beyond
4992 `num_regs' for the `-1' marker GNU
4994 regs->num_regs = MAX(RE_NREGS, num_nonshy_regs + 1);
4995 regs->start = TALLOC(regs->num_regs, regoff_t);
4996 regs->end = TALLOC(regs->num_regs, regoff_t);
4997 if (regs->start == NULL || regs->end == NULL) {
5001 bufp->regs_allocated = REGS_REALLOCATE;
5002 } else if (bufp->regs_allocated == REGS_REALLOCATE) {
5003 /* Yes. If we need more elements than were already
5004 allocated, reallocate them. If we need fewer, just
5006 if (regs->num_regs < num_nonshy_regs + 1) {
5007 regs->num_regs = num_nonshy_regs + 1;
5008 RETALLOC(regs->start, regs->num_regs, regoff_t);
5009 RETALLOC(regs->end, regs->num_regs, regoff_t);
5010 if (regs->start == NULL || regs->end == NULL) {
5016 /* The braces fend off a "empty body in an else-statement"
5017 warning under GCC when assert expands to nothing. */
5018 assert (bufp->regs_allocated == REGS_FIXED);
5021 /* Convert the pointer data in `regstart' and `regend' to
5022 indices. Register zero has to be set differently,
5023 since we haven't kept track of any info for it. */
5024 if (regs->num_regs > 0) {
5025 regs->start[0] = pos;
5026 regs->end[0] = (MATCHING_IN_FIRST_STRING
5027 ? ((regoff_t) (d - string1))
5028 : ((regoff_t) (d - string2 + size1)));
5031 /* Map over the NUM_NONSHY_REGS non-shy internal registers.
5032 Copy each into the corresponding external register.
5033 N.B. MCNT indexes external registers. */
5035 mcnt < MIN (num_nonshy_regs, regs->num_regs);
5037 int ireg = bufp->external_to_internal_register[mcnt];
5039 if (REG_UNSET (regstart[ireg]) || REG_UNSET (regend[ireg]))
5040 regs->start[mcnt] = regs->end[mcnt] = -1;
5043 = (regoff_t) POINTER_TO_OFFSET (regstart[ireg]);
5045 = (regoff_t) POINTER_TO_OFFSET (regend[ireg]);
5048 } /* regs && !bufp->no_sub */
5050 /* If we have regs and the regs structure has
5051 more elements than were in the pattern, set
5052 the extra elements to -1. If we
5053 (re)allocated the registers, this is the
5054 case, because we always allocate enough to
5055 have at least one -1 at the end.
5057 We do this even when no_sub is set because
5058 some applications (XEmacs) reuse register
5059 structures which may contain stale
5060 information, and permit attempts to access
5063 It would be possible to require the caller to
5064 do this, but we'd have to change the API for
5065 this function to reflect that, and audit all
5067 if (regs && regs->num_regs > 0)
5068 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++)
5069 regs->start[mcnt] = regs->end[mcnt] = -1;
5072 DEBUG_PRINT4("%u failure points pushed, %u popped (%u remain).\n",
5073 nfailure_points_pushed, nfailure_points_popped,
5074 nfailure_points_pushed - nfailure_points_popped);
5075 DEBUG_PRINT2("%u registers pushed.\n", num_regs_pushed);
5077 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
5081 DEBUG_PRINT2("Returning %d from re_match_2.\n", mcnt);
5087 /* Otherwise match next pattern command. */
5088 switch (SWITCH_ENUM_CAST((re_opcode_t) *p++)) {
5089 /* Ignore these. Used to ignore the n of succeed_n's
5090 which currently have n == 0. */
5092 DEBUG_PRINT1("EXECUTING no_op.\n");
5096 DEBUG_PRINT1("EXECUTING succeed.\n");
5099 /* Match the next n pattern characters exactly. The
5100 following byte in the pattern defines n, and the n
5101 bytes after that are the characters to match. */
5104 DEBUG_PRINT2("EXECUTING exactn %d.\n", mcnt);
5106 /* This is written out as an if-else so we don't waste
5107 time testing `translate' inside the loop. */
5108 if (TRANSLATE_P(translate)) {
5111 Emchar pat_ch, buf_ch;
5115 pat_ch = charptr_emchar(p);
5116 buf_ch = charptr_emchar(d);
5117 if (RE_TRANSLATE(buf_ch) != pat_ch)
5120 pat_len = charcount_to_bytecount(p, 1);
5125 #else /* not MULE */
5127 if ((unsigned char)RE_TRANSLATE(*d++) !=
5145 /* Match any character except possibly a newline or a
5148 DEBUG_PRINT1("EXECUTING anychar.\n");
5152 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5153 && TRANSLATE(*d) == '\n')
5154 || (bufp->syntax & RE_DOT_NOT_NULL
5155 && TRANSLATE(*d) == '\000'))
5159 DEBUG_PRINT2(" Matched `%d'.\n", *d);
5160 INC_CHARPTR(d); /* XEmacs change */
5165 REGISTER unsigned char c;
5167 (re_opcode_t) * (p - 1) == charset_not;
5169 DEBUG_PRINT2("EXECUTING charset%s.\n",
5170 not_p ? "_not" : "");
5173 /* The character to match. */
5176 /* Cast to `unsigned' instead of `unsigned char'
5177 in case the bit list is a full 32 bytes
5179 if (c < (unsigned)(*p * BYTEWIDTH)
5181 c / BYTEWIDTH] & (1 << (c %
5191 INC_CHARPTR(d); /* XEmacs change */
5197 case charset_mule_not: {
5200 (re_opcode_t) * (p - 1) == charset_mule_not;
5202 DEBUG_PRINT2("EXECUTING charset_mule%s.\n",
5203 not_p ? "_not" : "");
5206 c = charptr_emchar((const Bufbyte *)d);
5207 /* The character to match. */
5212 unified_range_table_lookup(p, c, Qnil)))
5215 p += unified_range_table_bytes_used(p);
5226 /* The beginning of a group is represented by
5227 start_memory. The arguments are the register number
5228 in the next byte, and the number of groups inner to
5229 this one in the next. The text matched within the
5230 group is recorded (in the internal registers data
5231 structure) under the register number. */
5233 DEBUG_PRINT3("EXECUTING start_memory %d (%d):\n", *p,
5236 /* Find out if this group can match the empty string. */
5237 p1 = p; /* To send to group_match_null_string_p. */
5239 if (REG_MATCH_NULL_STRING_P(reg_info[*p]) ==
5240 MATCH_NULL_UNSET_VALUE)
5241 REG_MATCH_NULL_STRING_P(reg_info[*p])
5242 = group_match_null_string_p(&p1, pend,
5245 /* Save the position in the string where we were the
5246 last time we were at this open-group operator in case
5247 the group is operated upon by a repetition operator,
5248 e.g., with `(a*)*b' against `ab'; then we want to
5249 ignore where we are now in the string in case this
5250 attempt to match fails. */
5251 old_regstart[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
5252 ? REG_UNSET(regstart[*p]) ? d : regstart[*p]
5254 DEBUG_PRINT2(" old_regstart: %d\n",
5255 POINTER_TO_OFFSET(old_regstart[*p]));
5258 DEBUG_PRINT2(" regstart: %d\n",
5259 POINTER_TO_OFFSET(regstart[*p]));
5261 IS_ACTIVE(reg_info[*p]) = 1;
5262 MATCHED_SOMETHING(reg_info[*p]) = 0;
5264 /* Clear this whenever we change the register activity
5266 set_regs_matched_done = 0;
5268 /* This is the new highest active register. */
5269 highest_active_reg = *p;
5271 /* If nothing was active before, this is the new lowest
5273 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5274 lowest_active_reg = *p;
5276 /* Move past the register number and inner group
5279 just_past_start_mem = p;
5283 /* The stop_memory opcode represents the end of a group.
5284 Its arguments are the same as start_memory's: the
5285 register number, and the number of inner groups. */
5287 DEBUG_PRINT3("EXECUTING stop_memory %d (%d):\n", *p,
5290 /* We need to save the string position the last time we
5291 were at this close-group operator in case the group
5292 is operated upon by a repetition operator, e.g., with
5293 `((a*)*(b*)*)*' against `aba'; then we want to ignore
5294 where we are now in the string in case this attempt
5296 old_regend[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
5297 ? REG_UNSET(regend[*p]) ? d : regend[*p]
5299 DEBUG_PRINT2(" old_regend: %d\n",
5300 POINTER_TO_OFFSET(old_regend[*p]));
5303 DEBUG_PRINT2(" regend: %d\n",
5304 POINTER_TO_OFFSET(regend[*p]));
5306 /* This register isn't active anymore. */
5307 IS_ACTIVE(reg_info[*p]) = 0;
5309 /* Clear this whenever we change the register activity
5311 set_regs_matched_done = 0;
5313 /* If this was the only register active, nothing is
5315 if (lowest_active_reg == highest_active_reg) {
5316 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5317 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5318 } else { /* We must scan for the new highest
5319 active register, since it isn't
5320 necessarily one less than now:
5321 consider (a(b)c(d(e)f)g). When group
5322 3 ends, after the f), the new highest
5323 active register is 1. */
5324 unsigned char r = *p - 1;
5325 while (r > 0 && !IS_ACTIVE(reg_info[r]))
5328 /* If we end up at register zero, that means
5329 that we saved the registers as the result of
5330 an `on_failure_jump', not a `start_memory',
5331 and we jumped to past the innermost
5332 `stop_memory'. For example, in ((.)*) we
5333 save registers 1 and 2 as a result of the *,
5334 but when we pop back to the second ), we are
5335 at the stop_memory 1. Thus, nothing is
5339 NO_LOWEST_ACTIVE_REG;
5340 highest_active_reg =
5341 NO_HIGHEST_ACTIVE_REG;
5343 highest_active_reg = r;
5345 /* 98/9/21 jhod: We've also gotta set
5346 lowest_active_reg, don't we? */
5348 while (r < highest_active_reg
5349 && !IS_ACTIVE(reg_info[r]))
5351 lowest_active_reg = r;
5355 /* If just failed to match something this time around
5356 with a group that's operated on by a repetition
5357 operator, try to force exit from the ``loop'', and
5358 restore the register information for this group that
5359 we had before trying this last match. */
5360 if ((!MATCHED_SOMETHING(reg_info[*p])
5361 || just_past_start_mem == p - 1)
5362 && (p + 2) < pend) {
5363 re_bool is_a_jump_n = false;
5367 switch ((unsigned int)(re_opcode_t)*p1++) {
5370 case pop_failure_jump:
5371 case maybe_pop_jump:
5373 case dummy_failure_jump:
5374 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
5384 /* If the next operation is a jump backwards in
5385 the pattern to an on_failure_jump right
5386 before the start_memory corresponding to this
5387 stop_memory, exit from the loop by forcing a
5388 failure after pushing on the stack the
5389 on_failure_jump's jump in the pattern, and
5392 && (re_opcode_t) * p1 == on_failure_jump
5393 && (re_opcode_t) p1[3] == start_memory
5395 /* If this group ever matched anything,
5396 then restore what its registers were
5397 before trying this last failed match,
5398 e.g., with `(a*)*b' against `ab' for
5399 regstart[1], and, e.g., with
5400 `((a*)*(b*)*)*' against `aba' for
5403 Also restore the registers for inner
5404 groups for, e.g., `((a*)(b*))*'
5405 against `aba' (register 3 would
5406 otherwise get trashed). */
5408 if (EVER_MATCHED_SOMETHING
5412 EVER_MATCHED_SOMETHING(reg_info
5416 /* Restore this and inner
5419 for (r = *p; r < *p + *(p + 1);
5424 /* xx why this test? */
5425 if (old_regend[r] >=
5433 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
5434 PUSH_FAILURE_POINT(p1 + mcnt, d, -2);
5440 /* Move past the register number and the inner group
5445 /* \<digit> has been turned into a `duplicate' command
5446 which is followed by the numeric value of <digit> as
5447 the register number. (Already passed through
5448 external-to-internal-register mapping, so it refers
5449 to the actual group number, not the non-shy-only
5450 numbering used in the external world.) */
5453 REGISTER re_char *d2, *dend2;
5454 /* Get which register to match against. */
5456 DEBUG_PRINT2("EXECUTING duplicate %d.\n",
5459 /* Can't back reference a group which we've
5461 if (REG_UNSET(regstart[regno])
5462 || REG_UNSET(regend[regno]))
5465 /* Where in input to try to start matching. */
5466 d2 = regstart[regno];
5468 /* Where to stop matching; if both the place to
5469 start and the place to stop matching are in
5470 the same string, then set to the place to
5471 stop, otherwise, for now have to use the end
5472 of the first string. */
5474 dend2 = ((FIRST_STRING_P(regstart[regno])
5475 == FIRST_STRING_P(regend[regno]))
5476 ? regend[regno] : end_match_1);
5478 /* If necessary, advance to next segment
5479 in register contents. */
5480 while (d2 == dend2) {
5481 if (dend2 == end_match_2)
5483 if (dend2 == regend[regno])
5486 /* End of string1 => advance to
5489 dend2 = regend[regno];
5491 /* At end of register contents =>
5496 /* If necessary, advance to next segment
5500 /* How many characters left in this segment to match. */
5503 /* Want how many consecutive characters
5504 we can match in one shot, so, if
5505 necessary, adjust the count. */
5506 if (mcnt > dend2 - d2)
5509 /* Compare that many; failure if
5510 mismatch, else move past them. */
5511 if (TRANSLATE_P(translate)
5513 (const unsigned char*)d,
5514 (const unsigned char*)d2,
5516 : memcmp(d, d2, mcnt)) {
5519 d += mcnt, d2 += mcnt;
5521 /* Do this because we've match some
5528 /* begline matches the empty string at the beginning of
5529 the string (unless `not_bol' is set in `bufp'), and,
5530 if `newline_anchor' is set, after newlines. */
5532 DEBUG_PRINT1("EXECUTING begline.\n");
5534 if (AT_STRINGS_BEG(d)) {
5537 } else if (d[-1] == '\n' && bufp->newline_anchor) {
5540 /* In all other cases, we fail. */
5543 /* endline is the dual of begline. */
5545 DEBUG_PRINT1("EXECUTING endline.\n");
5547 if (AT_STRINGS_END(d)) {
5552 /* We have to ``prefetch'' the next character. */
5553 else if ((d == end1 ? *string2 : *d) == '\n'
5554 && bufp->newline_anchor) {
5559 /* Match at the very beginning of the data. */
5561 DEBUG_PRINT1("EXECUTING begbuf.\n");
5562 if (AT_STRINGS_BEG(d))
5566 /* Match at the very end of the data. */
5568 DEBUG_PRINT1("EXECUTING endbuf.\n");
5569 if (AT_STRINGS_END(d))
5573 /* on_failure_keep_string_jump is used to optimize
5574 `.*\n'. It pushes NULL as the value for the string
5575 on the stack. Then `pop_failure_point' will keep the
5576 current value for the string, instead of restoring
5577 it. To see why, consider matching `foo\nbar' against
5578 `.*\n'. The .* matches the foo; then the . fails
5579 against the \n. But the next thing we want to do is
5580 match the \n against the \n; if we restored the
5581 string value, we would be back at the foo.
5583 Because this is used only in specific cases, we don't
5584 need to check all the things that `on_failure_jump'
5585 does, to make sure the right things get saved on the
5586 stack. Hence we don't share its code. The only
5587 reason to push anything on the stack at all is that
5588 otherwise we would have to change `anychar's code to
5589 do something besides goto fail in this case; that
5590 seems worse than this. */
5591 case on_failure_keep_string_jump:
5592 DEBUG_PRINT1("EXECUTING on_failure_keep_string_jump");
5594 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5595 DEBUG_PRINT3(" %d (to 0x%lx):\n", mcnt,
5598 PUSH_FAILURE_POINT(p + mcnt, (unsigned char *)0, -2);
5601 /* Uses of on_failure_jump:
5603 Each alternative starts with an on_failure_jump that
5604 points to the beginning of the next alternative.
5605 Each alternative except the last ends with a jump
5606 that in effect jumps past the rest of the
5607 alternatives. (They really jump to the ending jump
5608 of the following alternative, because tensioning
5609 these jumps is a hassle.)
5611 Repeats start with an on_failure_jump that points
5612 past both the repetition text and either the
5613 following jump or pop_failure_jump back to this
5615 case on_failure_jump:
5617 DEBUG_PRINT1("EXECUTING on_failure_jump");
5619 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5620 DEBUG_PRINT3(" %d (to 0x%lx)", mcnt, (long)(p + mcnt));
5622 /* If this on_failure_jump comes right before a group
5623 (i.e., the original * applied to a group), save the
5624 information for that group and all inner ones, so
5625 that if we fail back to this point, the group's
5626 information will be correct. For example, in
5627 \(a*\)*\1, we need the preceding group, and in
5628 \(\(a*\)b*\)\2, we need the inner group. */
5630 /* We can't use `p' to check ahead because we push
5631 a failure point to `p + mcnt' after we do this. */
5634 /* We need to skip no_op's before we look for the
5635 start_memory in case this on_failure_jump is
5636 happening as the result of a completed succeed_n, as
5637 in \(a\)\{1,3\}b\1 against aba. */
5638 while (p1 < pend && (re_opcode_t) * p1 == no_op)
5641 if (p1 < pend && (re_opcode_t) * p1 == start_memory) {
5642 /* We have a new highest active register now.
5643 This will get reset at the start_memory we
5644 are about to get to, but we will have saved
5645 all the registers relevant to this repetition
5646 op, as described above. */
5647 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5648 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5649 lowest_active_reg = *(p1 + 1);
5652 DEBUG_PRINT1(":\n");
5653 PUSH_FAILURE_POINT(p + mcnt, d, -2);
5656 /* A smart repeat ends with `maybe_pop_jump'.
5657 We change it to either `pop_failure_jump' or `jump'. */
5658 case maybe_pop_jump:
5659 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5660 DEBUG_PRINT2("EXECUTING maybe_pop_jump %d.\n", mcnt);
5662 REGISTER const unsigned char *p2 = p;
5664 /* Compare the beginning of the repeat with what
5665 in the pattern follows its end. If we can
5666 establish that there is nothing that they
5667 would both match, i.e., that we would have to
5668 backtrack because of (as in, e.g., `a*a')
5669 then we can change to pop_failure_jump,
5670 because we'll never have to backtrack.
5672 This is not true in the case of alternatives:
5673 in `(a|ab)*' we do need to backtrack to the
5674 `ab' alternative (e.g., if the string was
5675 `ab'). But instead of trying to detect that
5676 here, the alternative has put on a dummy
5677 failure point which is what we will end up
5680 /* Skip over open/close-group commands. If what
5681 follows this loop is a ...+ construct, look
5682 at what begins its body, since we will have
5683 to match at least one of that. */
5686 && ((re_opcode_t) * p2 ==
5688 || (re_opcode_t) * p2 ==
5691 else if (p2 + 6 < pend
5692 && (re_opcode_t) * p2 ==
5700 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5701 to the `maybe_finalize_jump' of this case. Examine what
5704 /* If we're at the end of the pattern, we can change. */
5706 /* Consider what happens when matching ":\(.*\)"
5707 against ":/". I don't really understand this code
5709 p[-3] = (unsigned char)pop_failure_jump;
5711 (" End of pattern: change to `pop_failure_jump'.\n");
5714 else if ((re_opcode_t) * p2 == exactn
5715 || (bufp->newline_anchor
5716 && (re_opcode_t) * p2 ==
5718 REGISTER unsigned char c =
5720 (unsigned char)endline ? '\n' :
5723 if ((re_opcode_t) p1[3] == exactn
5729 (" %c != %c => pop_failure_jump.\n",
5733 else if ((re_opcode_t) p1[3] == charset
5734 || (re_opcode_t) p1[3] ==
5737 (re_opcode_t) p1[3] ==
5741 (unsigned char)(p1[4] *
5745 BYTEWIDTH] & (1 << (c
5750 /* `not_p' is equal to 1 if c would match, which means
5751 that we can't change to pop_failure_jump. */
5757 (" No match => pop_failure_jump.\n");
5760 } else if ((re_opcode_t) * p2 == charset) {
5761 #ifdef REGEX_DEBUG_FLAG
5762 REGISTER unsigned char c
5765 (unsigned char)endline ? '\n' :
5769 if ((re_opcode_t) p1[3] == exactn
5770 && !((int)p2[1] * BYTEWIDTH >
5772 && (p2[2 + p1[5] / BYTEWIDTH]
5780 (" %c != %c => pop_failure_jump.\n",
5784 else if ((re_opcode_t) p1[3] ==
5787 /* We win if the charset_not inside the loop
5788 lists every character listed in the charset after. */
5789 for (idx = 0; idx < (int)p2[1];
5807 (" No match => pop_failure_jump.\n");
5809 } else if ((re_opcode_t) p1[3] ==
5812 /* We win if the charset inside the loop
5813 has no overlap with the one after the loop. */
5816 && idx < (int)p1[4]; idx++)
5827 (" No match => pop_failure_jump.\n");
5832 p -= 2; /* Point at relative address again. */
5833 if ((re_opcode_t) p[-1] != pop_failure_jump) {
5834 p[-1] = (unsigned char)jump;
5835 DEBUG_PRINT1(" Match => jump.\n");
5836 goto unconditional_jump;
5838 /* Note fall through. */
5840 /* The end of a simple repeat has a pop_failure_jump
5841 back to its matching on_failure_jump, where the
5842 latter will push a failure point. The
5843 pop_failure_jump takes off failure points put on by
5844 this pop_failure_jump's matching on_failure_jump; we
5845 got through the pattern to here from the matching
5846 on_failure_jump, so didn't fail. */
5847 case pop_failure_jump: {
5848 /* We need to pass separate storage for the
5849 lowest and highest registers, even though we
5850 don't care about the actual values.
5851 Otherwise, we will restore only one register
5852 from the stack, since lowest will == highest
5853 in `pop_failure_point'. */
5854 int dummy_low_reg, dummy_high_reg;
5855 const unsigned char *pdummy = NULL;
5856 const unsigned char *sdummy = NULL;
5858 DEBUG_PRINT1("EXECUTING pop_failure_jump.\n");
5859 POP_FAILURE_POINT(sdummy, pdummy,
5860 dummy_low_reg, dummy_high_reg,
5861 reg_dummy, reg_dummy,
5863 SXE_SET_UNUSED(pdummy), SXE_SET_UNUSED(sdummy);
5865 /* Note fall through. */
5867 /* Unconditionally jump (without popping any failure
5871 /* Get the amount to jump. */
5872 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5873 DEBUG_PRINT2("EXECUTING jump %d ", mcnt);
5874 p += mcnt; /* Do the jump. */
5875 DEBUG_PRINT2("(to 0x%lx).\n", (long)p);
5878 /* We need this opcode so we can detect where alternatives end
5879 in `group_match_null_string_p' et al. */
5881 DEBUG_PRINT1("EXECUTING jump_past_alt.\n");
5882 goto unconditional_jump;
5884 /* Normally, the on_failure_jump pushes a failure point, which
5885 then gets popped at pop_failure_jump. We will end up at
5886 pop_failure_jump, also, and with a pattern of, say, `a+', we
5887 are skipping over the on_failure_jump, so we have to push
5888 something meaningless for pop_failure_jump to pop. */
5889 case dummy_failure_jump:
5890 DEBUG_PRINT1("EXECUTING dummy_failure_jump.\n");
5891 /* It doesn't matter what we push for the string here. What
5892 the code at `fail' tests is the value for the pattern. */
5893 PUSH_FAILURE_POINT(NULL, NULL, -2);
5894 goto unconditional_jump;
5896 /* At the end of an alternative, we need to push a dummy failure
5897 point in case we are followed by a `pop_failure_jump', because
5898 we don't want the failure point for the alternative to be
5899 popped. For example, matching `(a|ab)*' against `aab'
5900 requires that we match the `ab' alternative. */
5901 case push_dummy_failure:
5902 DEBUG_PRINT1("EXECUTING push_dummy_failure.\n");
5903 /* See comments just above at `dummy_failure_jump' about the
5905 PUSH_FAILURE_POINT(NULL, NULL, -2);
5908 /* Have to succeed matching what follows at least n times.
5909 After that, handle like `on_failure_jump'. */
5911 EXTRACT_NUMBER(mcnt, p + 2);
5912 DEBUG_PRINT2("EXECUTING succeed_n %d.\n", mcnt);
5915 /* Originally, this is how many times we HAVE to succeed. */
5919 STORE_NUMBER_AND_INCR(p, mcnt);
5920 DEBUG_PRINT3(" Setting 0x%lx to %d.\n",
5922 } else if (mcnt == 0) {
5924 (" Setting two bytes from 0x%lx to no_op.\n",
5926 p[2] = (unsigned char)no_op;
5927 p[3] = (unsigned char)no_op;
5933 EXTRACT_NUMBER(mcnt, p + 2);
5934 DEBUG_PRINT2("EXECUTING jump_n %d.\n", mcnt);
5936 /* Originally, this is how many times we CAN jump. */
5939 STORE_NUMBER(p + 2, mcnt);
5940 goto unconditional_jump;
5942 /* If don't have to jump any more, skip over the rest of command. */
5949 DEBUG_PRINT1("EXECUTING set_number_at.\n");
5951 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5953 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5954 DEBUG_PRINT3(" Setting 0x%lx to %d.\n",
5956 STORE_NUMBER(p1, mcnt);
5961 DEBUG_PRINT1("EXECUTING wordbound.\n");
5966 /* Straightforward and (I hope) correct implementation.
5967 Probably should be optimized by arranging to compute
5969 /* emch1 is the character before d, syn1 is the syntax of
5970 emch1, emch2 is the character at d, and syn2 is the
5972 Emchar emch1, emch2;
5973 /* GCC isn't smart enough to see these are initialized if used. */
5974 int syn1 = 0, syn2 = 0;
5975 re_char *d_before, *d_after;
5977 at_beg = AT_STRINGS_BEG(d),
5978 at_end = AT_STRINGS_END(d);
5983 if (at_beg && at_end) {
5988 POS_BEFORE_GAP_UNSAFE(d);
5989 DEC_CHARPTR(d_before);
5991 charptr_emchar(d_before);
5994 SYNTAX_CACHE_BYTE_TO_CHAR
5995 (PTR_TO_OFFSET(d)) - 1;
5996 UPDATE_SYNTAX_CACHE(xpos);
5998 syn1 = SYNTAX_FROM_CACHE
6000 (regex_emacs_buffer->
6001 mirror_syntax_table),
6006 POS_AFTER_GAP_UNSAFE(d);
6007 emch2 = charptr_emchar(d_after);
6010 SYNTAX_CACHE_BYTE_TO_CHAR
6012 UPDATE_SYNTAX_CACHE_FORWARD(xpos
6016 syn2 = SYNTAX_FROM_CACHE
6018 (regex_emacs_buffer->
6019 mirror_syntax_table),
6024 result = (syn2 == Sword);
6026 result = (syn1 == Sword);
6033 if (result == should_succeed)
6039 DEBUG_PRINT1("EXECUTING notwordbound.\n");
6041 goto matchwordbound;
6044 DEBUG_PRINT1("EXECUTING wordbeg.\n");
6045 if (AT_STRINGS_END(d))
6048 /* XEmacs: this originally read:
6050 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
6054 re_char *dtmp = POS_AFTER_GAP_UNSAFE(d);
6055 Emchar emch = charptr_emchar(dtmp);
6058 SYNTAX_CACHE_BYTE_TO_CHAR(PTR_TO_OFFSET(d));
6059 UPDATE_SYNTAX_CACHE(charpos);
6061 if (SYNTAX_FROM_CACHE
6063 (regex_emacs_buffer->mirror_syntax_table),
6066 if (AT_STRINGS_BEG(d))
6068 dtmp = POS_BEFORE_GAP_UNSAFE(d);
6070 emch = charptr_emchar(dtmp);
6072 UPDATE_SYNTAX_CACHE_BACKWARD(charpos - 1);
6074 if (SYNTAX_FROM_CACHE
6076 (regex_emacs_buffer->mirror_syntax_table),
6083 DEBUG_PRINT1("EXECUTING wordend.\n");
6084 if (AT_STRINGS_BEG(d))
6087 /* XEmacs: this originally read:
6089 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
6090 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
6093 The or condition is incorrect (reversed).
6099 SYNTAX_CACHE_BYTE_TO_CHAR(PTR_TO_OFFSET(d))
6101 UPDATE_SYNTAX_CACHE(charpos);
6103 dtmp = POS_BEFORE_GAP_UNSAFE(d);
6105 emch = charptr_emchar(dtmp);
6106 if (SYNTAX_FROM_CACHE
6108 (regex_emacs_buffer->mirror_syntax_table),
6111 if (AT_STRINGS_END(d))
6113 dtmp = POS_AFTER_GAP_UNSAFE(d);
6114 emch = charptr_emchar(dtmp);
6116 UPDATE_SYNTAX_CACHE_FORWARD(charpos + 1);
6118 if (SYNTAX_FROM_CACHE
6120 (regex_emacs_buffer->mirror_syntax_table),
6128 DEBUG_PRINT1("EXECUTING before_dot.\n");
6130 (NILP(regex_match_object)
6131 || BUFFERP(regex_match_object))
6132 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6133 >= BUF_PT(regex_emacs_buffer)))
6138 DEBUG_PRINT1("EXECUTING at_dot.\n");
6140 (NILP(regex_match_object)
6141 || BUFFERP(regex_match_object))
6142 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6143 != BUF_PT(regex_emacs_buffer)))
6148 DEBUG_PRINT1("EXECUTING after_dot.\n");
6150 (NILP(regex_match_object)
6151 || BUFFERP(regex_match_object))
6152 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6153 <= BUF_PT(regex_emacs_buffer)))
6156 #if 0 /* not emacs19 */
6158 DEBUG_PRINT1("EXECUTING at_dot.\n");
6159 if (BUF_PTR_BYTE_POS
6160 (regex_emacs_buffer,
6161 (unsigned char *)d) + 1 !=
6162 BUF_PT(regex_emacs_buffer))
6165 #endif /* not emacs19 */
6168 DEBUG_PRINT2("EXECUTING syntaxspec %d.\n", mcnt);
6173 DEBUG_PRINT1("EXECUTING Emacs wordchar.\n");
6186 SYNTAX_CACHE_BYTE_TO_CHAR
6188 UPDATE_SYNTAX_CACHE(charpos);
6192 emch = charptr_emchar((const Bufbyte *)d);
6196 (regex_emacs_buffer->mirror_syntax_table),
6197 emch) == (enum syntaxcode)mcnt);
6199 if (matches != should_succeed)
6206 DEBUG_PRINT2("EXECUTING notsyntaxspec %d.\n", mcnt);
6208 goto matchnotsyntax;
6211 DEBUG_PRINT1("EXECUTING Emacs notwordchar.\n");
6215 goto matchornotsyntax;
6218 /* 97/2/17 jhod Mule category code patch */
6227 emch = charptr_emchar((const Bufbyte *)d);
6229 if (check_category_char
6230 (emch, regex_emacs_buffer->category_table,
6231 mcnt, should_succeed))
6237 case notcategoryspec:
6239 goto matchornotcategory;
6240 /* end of category patch */
6242 #else /* not emacs */
6244 DEBUG_PRINT1("EXECUTING non-Emacs wordchar.\n");
6246 if (!WORDCHAR_P_UNSAFE((int)(*d)))
6253 DEBUG_PRINT1("EXECUTING non-Emacs notwordchar.\n");
6255 if (!WORDCHAR_P_UNSAFE((int)(*d)))
6265 /* Successfully executed one pattern command; keep going. */
6268 /* We goto here if a matching operation fails. */
6270 if (!FAIL_STACK_EMPTY()) {
6271 /* A restart point is known. Restore to that state. */
6272 DEBUG_PRINT1("\nFAIL:\n");
6274 const unsigned char* cpat = p;
6276 POP_FAILURE_POINT(d, cpat, lowest_active_reg,
6277 highest_active_reg, regstart, regend,
6279 #pragma GCC diagnostic push
6280 #pragma GCC diagnostic ignored "-Wcast-qual"
6281 p = (unsigned char*)cpat;
6282 #pragma GCC diagnostic pop
6284 /* If this failure point is a dummy, try the next one. */
6288 /* If we failed to the end of the pattern, don't examine
6292 re_bool is_a_jump_n = false;
6294 /* If failed to a backwards jump that's part of
6295 a repetition loop, need to pop this failure
6296 point and use the next one. */
6297 switch ((unsigned int)(re_opcode_t)*p) {
6300 case maybe_pop_jump:
6301 case pop_failure_jump:
6304 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6308 && (re_opcode_t) * p1 == succeed_n)
6310 && (re_opcode_t) * p1 ==
6319 if (d >= string1 && d <= end1)
6322 break; /* Matching at this starting point really fails. */
6326 goto restore_best_regs;
6330 return -1; /* Failure to match. */
6334 /* Subroutine definitions for re_match_2. */
6336 /* We are passed P pointing to a register number after a start_memory.
6338 Return true if the pattern up to the corresponding stop_memory can
6339 match the empty string, and false otherwise.
6341 If we find the matching stop_memory, sets P to point to one past its number.
6342 Otherwise, sets P to an undefined byte less than or equal to END.
6344 We don't handle duplicates properly (yet). */
6347 group_match_null_string_p(unsigned char **p, unsigned char *end,
6348 register_info_type * register_info)
6351 /* Point to after the args to the start_memory. */
6352 unsigned char *p1 = *p + 2;
6355 /* Skip over opcodes that can match nothing, and return true or
6356 false, as appropriate, when we get to one that can't, or to the
6357 matching stop_memory. */
6359 switch ((unsigned int)(re_opcode_t)*p1) {
6360 /* Could be either a loop or a series of alternatives. */
6361 case on_failure_jump:
6363 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6365 /* If the next operation is not a jump backwards in the
6369 /* Go through the on_failure_jumps of the alternatives,
6370 seeing if any of the alternatives cannot match nothing.
6371 The last alternative starts with only a jump,
6372 whereas the rest start with on_failure_jump and end
6373 with a jump, e.g., here is the pattern for `a|b|c':
6375 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
6376 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
6379 So, we have to first go through the first (n-1)
6380 alternatives and then deal with the last one separately. */
6382 /* Deal with the first (n-1) alternatives, which start
6383 with an on_failure_jump (see above) that jumps to right
6384 past a jump_past_alt. */
6386 while ((re_opcode_t) p1[mcnt - 3] ==
6388 /* `mcnt' holds how many bytes long the alternative
6389 is, including the ending `jump_past_alt' and
6392 if (!alt_match_null_string_p
6393 (p1, p1 + mcnt - 3, register_info))
6396 /* Move to right after this alternative, including the
6400 /* Break if it's the beginning of an n-th alternative
6401 that doesn't begin with an on_failure_jump. */
6402 if ((re_opcode_t) * p1 !=
6406 /* Still have to check that it's not an n-th
6407 alternative that starts with an on_failure_jump. */
6409 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6410 if ((re_opcode_t) p1[mcnt - 3] !=
6412 /* Get to the beginning of the n-th alternative. */
6418 /* Deal with the last alternative: go back and get number
6419 of the `jump_past_alt' just before it. `mcnt' contains
6420 the length of the alternative. */
6421 EXTRACT_NUMBER(mcnt, p1 - 2);
6423 if (!alt_match_null_string_p
6424 (p1, p1 + mcnt, register_info))
6427 p1 += mcnt; /* Get past the n-th alternative. */
6432 assert(p1[1] == **p);
6437 if (!common_op_match_null_string_p(&p1, end, register_info))
6440 } /* while p1 < end */
6443 } /* group_match_null_string_p */
6445 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6446 It expects P to be the first byte of a single alternative and END one
6447 byte past the last. The alternative can contain groups. */
6450 alt_match_null_string_p(unsigned char *p, unsigned char *end,
6451 register_info_type * register_info)
6454 unsigned char *p1 = p;
6457 /* Skip over opcodes that can match nothing, and break when we get
6458 to one that can't. */
6460 switch ((unsigned int)(re_opcode_t)*p1) {
6462 case on_failure_jump:
6464 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6469 if (!common_op_match_null_string_p(&p1, end, register_info))
6472 } /* while p1 < end */
6475 } /* alt_match_null_string_p */
6477 /* Deals with the ops common to group_match_null_string_p and
6478 alt_match_null_string_p.
6480 Sets P to one after the op and its arguments, if any. */
6483 common_op_match_null_string_p(unsigned char **p, unsigned char *end,
6484 register_info_type * register_info)
6489 unsigned char *p1 = *p;
6491 switch ((unsigned int)(re_opcode_t)*p1++) {
6510 assert(reg_no > 0 && reg_no <= MAX_REGNUM);
6511 ret = group_match_null_string_p(&p1, end, register_info);
6513 /* Have to set this here in case we're checking a group which
6514 contains a group and a back reference to it. */
6516 if (REG_MATCH_NULL_STRING_P(register_info[reg_no]) ==
6517 MATCH_NULL_UNSET_VALUE)
6518 REG_MATCH_NULL_STRING_P(register_info[reg_no]) = ret;
6524 /* If this is an optimized succeed_n for zero times, make the jump. */
6526 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6534 /* Get to the number of times to succeed. */
6536 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6540 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6547 if (!REG_MATCH_NULL_STRING_P(register_info[*p1]))
6556 /* All other opcodes mean we cannot match the empty string. */
6562 } /* common_op_match_null_string_p */
6564 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6565 bytes; nonzero otherwise. */
6568 bcmp_translate(re_char *s1, re_char *s2,
6569 REGISTER int len, RE_TRANSLATE_TYPE translate)
6571 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6573 const unsigned char *p1_end = s1 + len;
6574 const unsigned char *p2_end = s2 + len;
6576 while (p1 != p1_end && p2 != p2_end) {
6577 Emchar p1_ch, p2_ch;
6579 p1_ch = charptr_emchar(p1);
6580 p2_ch = charptr_emchar(p2);
6582 if (RE_TRANSLATE(p1_ch)
6583 != RE_TRANSLATE(p2_ch))
6588 #else /* not MULE */
6590 if (RE_TRANSLATE(*p1++) != RE_TRANSLATE(*p2++))
6598 /* Entry points for GNU code. */
6600 /* re_compile_pattern is the GNU regular expression compiler: it
6601 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6602 Returns 0 if the pattern was valid, otherwise an error string.
6604 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6605 are set in BUFP on entry.
6607 We call regex_compile to do the actual compilation. */
6609 const char *re_compile_pattern(const char *pattern, int length,
6610 struct re_pattern_buffer *bufp)
6614 /* GNU code is written to assume at least RE_NREGS registers will be set
6615 (and at least one extra will be -1). */
6616 bufp->regs_allocated = REGS_UNALLOCATED;
6618 /* And GNU code determines whether or not to get register information
6619 by passing null for the REGS argument to re_match, etc., not by
6623 /* Match anchors at newline. */
6624 bufp->newline_anchor = 1;
6626 ret = regex_compile((const unsigned char*)pattern,
6627 length, re_syntax_options, bufp);
6631 return gettext(re_error_msgid[(int)ret]);
6634 /* Entry points compatible with 4.2 BSD regex library. We don't define
6635 them unless specifically requested. */
6637 #ifdef _REGEX_RE_COMP
6639 /* BSD has one and only one pattern buffer. */
6640 static struct re_pattern_buffer re_comp_buf;
6642 char *re_comp(const char *s)
6647 if (!re_comp_buf.buffer)
6648 return gettext("No previous regular expression");
6652 if (!re_comp_buf.buffer) {
6653 re_comp_buf.buffer = (unsigned char *)xmalloc_atomic(200);
6654 if (re_comp_buf.buffer == NULL)
6655 return gettext(re_error_msgid[(int)REG_ESPACE]);
6656 re_comp_buf.allocated = 200;
6658 re_comp_buf.fastmap = (char *)xmalloc_atomic(1 << BYTEWIDTH);
6659 if (re_comp_buf.fastmap == NULL)
6660 return gettext(re_error_msgid[(int)REG_ESPACE]);
6663 /* Since `re_exec' always passes NULL for the `regs' argument, we
6664 don't need to initialize the pattern buffer fields which affect it. */
6666 /* Match anchors at newlines. */
6667 re_comp_buf.newline_anchor = 1;
6670 regex_compile((unsigned char *)s, strlen(s), re_syntax_options,
6676 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6677 return (char *)gettext(re_error_msgid[(int)ret]);
6680 int re_exec(const char *s)
6682 const int len = strlen(s);
6684 0 <= re_search(&re_comp_buf, s, len, 0, len,
6685 (struct re_registers *)0);
6687 #endif /* _REGEX_RE_COMP */
6689 /* POSIX.2 functions. Don't define these for Emacs. */
6693 /* regcomp takes a regular expression as a string and compiles it.
6695 PREG is a regex_t *. We do not expect any fields to be initialized,
6696 since POSIX says we shouldn't. Thus, we set
6698 `buffer' to the compiled pattern;
6699 `used' to the length of the compiled pattern;
6700 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6701 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6702 RE_SYNTAX_POSIX_BASIC;
6703 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6704 `fastmap' and `fastmap_accurate' to zero;
6705 `re_nsub' to the number of subexpressions in PATTERN.
6706 (non-shy of course. POSIX probably doesn't know about
6707 shy ones, and in any case they should be invisible.)
6709 PATTERN is the address of the pattern string.
6711 CFLAGS is a series of bits which affect compilation.
6713 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6714 use POSIX basic syntax.
6716 If REG_NEWLINE is set, then . and [^...] don't match newline.
6717 Also, regexec will try a match beginning after every newline.
6719 If REG_ICASE is set, then we considers upper- and lowercase
6720 versions of letters to be equivalent when matching.
6722 If REG_NOSUB is set, then when PREG is passed to regexec, that
6723 routine will report only success or failure, and nothing about the
6726 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6727 the return codes and their meanings.) */
6729 int regcomp(regex_t * preg, const char *pattern, int cflags)
6733 = (cflags & REG_EXTENDED) ?
6734 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6736 /* regex_compile will allocate the space for the compiled pattern. */
6738 preg->allocated = 0;
6741 /* Don't bother to use a fastmap when searching. This simplifies the
6742 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6743 characters after newlines into the fastmap. This way, we just try
6747 if (cflags & REG_ICASE) {
6750 preg->translate = (char *)xmalloc_atomic(CHAR_SET_SIZE);
6751 if (preg->translate == NULL)
6752 return (int)REG_ESPACE;
6754 /* Map uppercase characters to corresponding lowercase ones. */
6755 for (i = 0; i < CHAR_SET_SIZE; i++)
6756 preg->translate[i] = ISUPPER(i) ? tolower(i) : i;
6758 preg->translate = NULL;
6760 /* If REG_NEWLINE is set, newlines are treated differently. */
6761 if (cflags & REG_NEWLINE) {
6762 /* REG_NEWLINE implies neither . nor [^...] match newline. */
6763 syntax &= ~RE_DOT_NEWLINE;
6764 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6765 /* It also changes the matching behavior. */
6766 preg->newline_anchor = 1;
6768 preg->newline_anchor = 0;
6770 preg->no_sub = !!(cflags & REG_NOSUB);
6772 /* POSIX says a null character in the pattern terminates it, so we
6773 can use strlen here in compiling the pattern. */
6774 ret = regex_compile((const unsigned char*)pattern,
6775 strlen(pattern), syntax, preg);
6777 /* POSIX doesn't distinguish between an unmatched open-group and an
6778 unmatched close-group: both are REG_EPAREN. */
6779 if (ret == REG_ERPAREN)
6785 /* regexec searches for a given pattern, specified by PREG, in the
6788 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6789 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6790 least NMATCH elements, and we set them to the offsets of the
6791 corresponding matched substrings.
6793 EFLAGS specifies `execution flags' which affect matching: if
6794 REG_NOTBOL is set, then ^ does not match at the beginning of the
6795 string; if REG_NOTEOL is set, then $ does not match at the end.
6797 We return 0 if we find a match and REG_NOMATCH if not. */
6800 regexec(const regex_t * preg, const char *string, Element_count nmatch,
6801 regmatch_t pmatch[], int eflags)
6804 struct re_registers regs;
6805 regex_t private_preg;
6806 int len = strlen(string);
6807 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6809 private_preg = *preg;
6811 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6812 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6814 /* The user has told us exactly how many registers to return
6815 information about, via `nmatch'. We have to pass that on to the
6816 matching routines. */
6817 private_preg.regs_allocated = REGS_FIXED;
6819 if (want_reg_info) {
6820 regs.num_regs = nmatch;
6821 regs.start = TALLOC(nmatch, regoff_t);
6822 regs.end = TALLOC(nmatch, regoff_t);
6823 if (regs.start == NULL || regs.end == NULL)
6824 return (int)REG_NOMATCH;
6827 /* Perform the searching operation. */
6828 ret = re_search(&private_preg, string, len,
6829 /* start: */ 0, /* range: */ len,
6830 want_reg_info ? ®s : (struct re_registers *)0);
6832 /* Copy the register information to the POSIX structure. */
6833 if (want_reg_info) {
6837 for (r = 0; r < nmatch; r++) {
6838 pmatch[r].rm_so = regs.start[r];
6839 pmatch[r].rm_eo = regs.end[r];
6843 /* If we needed the temporary register info, free the space now. */
6848 /* We want zero return to mean success, unlike `re_search'. */
6849 return ret >= 0 ? (int)REG_NOERROR : (int)REG_NOMATCH;
6852 /* Returns a message corresponding to an error code, ERRCODE, returned
6853 from either regcomp or regexec. We don't use PREG here. */
6856 regerror(int errcode, const regex_t * preg, char *errbuf,
6857 Memory_count errbuf_size)
6860 Memory_count msg_size;
6862 if (errcode < 0 || (size_t) errcode >= (sizeof(re_error_msgid)
6863 / sizeof(re_error_msgid[0])))
6864 /* Only error codes returned by the rest of the code should be passed
6865 to this routine. If we are given anything else, or if other regex
6866 code generates an invalid error code, then the program has a bug.
6867 Dump core so we can fix it. */
6870 msg = gettext(re_error_msgid[errcode]);
6872 msg_size = strlen(msg) + 1; /* Includes the null. */
6874 if (errbuf_size != 0) {
6875 strncpy(errbuf, msg, errbuf_size - 1);
6876 errbuf[errbuf_size-1]='\0';
6882 /* Free dynamically allocated space used by PREG. */
6884 void regfree(regex_t * preg)
6886 if (preg->buffer != NULL) {
6887 xfree(preg->buffer);
6889 preg->buffer = NULL;
6891 preg->allocated = 0;
6894 if (preg->fastmap != NULL) {
6895 xfree(preg->fastmap);
6897 preg->fastmap = NULL;
6898 preg->fastmap_accurate = 0;
6900 if (preg->translate != NULL) {
6901 xfree(preg->translate);
6903 preg->translate = NULL;
6906 #endif /* not emacs */
6910 make-backup-files: t
6912 trim-versions-without-asking: nil