1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
10 This file is part of SXEmacs
12 SXEmacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 SXEmacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
86 #include "sxe-utils.h"
88 /* The `emacs' switch turns on certain matching commands
89 that make sense only in Emacs. */
98 Lisp_Object Vthe_lisp_rangetab;
100 void complex_vars_of_regex(void)
102 Vthe_lisp_rangetab = Fmake_range_table();
103 staticpro(&Vthe_lisp_rangetab);
108 void complex_vars_of_regex(void)
114 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
115 #define TRANSLATE_P(tr) (!NILP (tr) && CHAR_TABLEP(tr))
117 /* just massage that xfree thing */
119 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
127 /* If we are not linking with Emacs proper,
128 we can't use the relocating allocator
129 even if config.h says that we can. */
132 #if defined (STDC_HEADERS) || defined (_LIBC)
134 # include <stdbool.h>
140 /* Types normally included via lisp.h */
141 #include <stddef.h> /* for ptrdiff_t */
144 #ifndef DECLARE_NOTHING
145 #define DECLARE_NOTHING struct nosuchstruct
151 #define charptr_emchar(str) ((Emchar) (str)[0])
153 #define INC_CHARPTR(p) ((p)++)
154 #define DEC_CHARPTR(p) ((p)--)
158 /* Define the syntax stuff for \<, \>, etc. */
160 /* This must be nonzero for the wordchar and notwordchar pattern
161 commands in re_match_2. */
168 extern char *re_syntax_table;
170 #else /* not SYNTAX_TABLE */
172 /* How many characters in the character set. */
173 #define CHAR_SET_SIZE 256
175 static char re_syntax_table[CHAR_SET_SIZE];
177 static void init_syntax_once(void)
182 const char *word_syntax_chars =
183 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
185 memset(re_syntax_table, 0, sizeof(re_syntax_table));
187 while (*word_syntax_chars)
188 re_syntax_table[(unsigned int)(*word_syntax_chars++)] =
195 #endif /* SYNTAX_TABLE */
197 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
198 #undef SYNTAX_FROM_CACHE
199 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
201 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
202 #define TRANSLATE_P(tr) tr
204 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
205 # if defined HAVE_GC_GC_H
207 # elif defined HAVE_GC_H
210 # define xmalloc GC_MALLOC
211 # define xmalloc_atomic GC_MALLOC_ATOMIC
212 # define xrealloc GC_REALLOC
213 # define xfree GC_FREE
214 # define xstrdup GC_STRDUP
215 #else /* !HAVE_BDWGC */
216 # define xmalloc malloc
217 # define xmalloc_atomic malloc
218 # define xrealloc realloc
220 # define xstrdup strdup
221 #endif /* HAVE_BDWGC */
226 /* Under XEmacs, this is needed because we don't define it elsewhere. */
227 #ifdef SWITCH_ENUM_BUG
228 #define SWITCH_ENUM_CAST(x) ((int)(x))
230 #define SWITCH_ENUM_CAST(x) (x)
233 /* Get the interface, including the syntax bits. */
236 /* isalpha etc. are used for the character classes. */
239 /* Jim Meyering writes:
241 "... Some ctype macros are valid only for character codes that
242 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
243 using /bin/cc or gcc but without giving an ansi option). So, all
244 ctype uses should be through macros like ISPRINT... If
245 STDC_HEADERS is defined, then autoconf has verified that the ctype
246 macros don't need to be guarded with references to isascii. ...
247 Defining isascii to 1 should let any compiler worth its salt
248 eliminate the && through constant folding." */
250 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
251 #define ISASCII_1(c) 1
253 #define ISASCII_1(c) isascii(c)
257 /* The IS*() macros can be passed any character, including an extended
258 one. We need to make sure there are no crashes, which would occur
259 otherwise due to out-of-bounds array references. */
260 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
262 #define ISASCII(c) ISASCII_1 (c)
266 #define ISBLANK(c) (ISASCII (c) && isblank (c))
268 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
271 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
273 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
276 /* stolen from FSF/src/charset.h */
277 #define SINGLE_BYTE_CHAR_P(c) (((unsigned int)(c) & 0xFF) == (c))
278 /* 1 if C is an ASCII character. */
279 #define IS_REAL_ASCII(c) ((c) < 128)
280 /* 1 if C is a unibyte character. */
281 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P(c))
283 #define ISPRINT(c) (ISASCII (c) && isprint (c))
284 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
285 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
286 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
287 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
288 #define ISLOWER(c) (ISASCII (c) && islower (c))
289 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
290 #define ISSPACE(c) (ISASCII (c) && isspace (c))
291 #define ISUPPER(c) (ISASCII (c) && isupper (c))
292 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
297 __attribute__((always_inline));
301 Lisp_Object ct = regex_emacs_buffer->mirror_syntax_table;
303 return (SYNTAX(XCHAR_TABLE(ct), c) == Sword);
306 # define ISWORD(c) (ISALPHA(c))
310 #define NULL (void *)0
313 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
314 since ours (we hope) works properly with all combinations of
315 machines, compilers, `char' and `unsigned char' argument types.
316 (Per Bothner suggested the basic approach.) */
317 #undef SIGN_EXTEND_CHAR
319 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
320 #else /* not __STDC__ */
321 /* As in Harbison and Steele. */
322 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
325 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
326 use `alloca' instead of `malloc'. This is because using malloc in
327 re_search* or re_match* could cause memory leaks when C-g is used in
328 Emacs; also, malloc is slower and causes storage fragmentation. On
329 the other hand, malloc is more portable, and easier to debug.
331 Because we sometimes use alloca, some routines have to be macros,
332 not functions -- `alloca'-allocated space disappears at the end of the
333 function it is called in. */
337 #define REGEX_ALLOCATE xmalloc
338 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc(source, nsize)
339 #define REGEX_FREE xfree
341 #else /* !REGEX_MALLOC */
343 /* Emacs already defines alloca, sometimes. */
346 /* Make alloca work the best possible way. */
348 #define alloca __builtin_alloca
349 #else /* not __GNUC__ */
352 #else /* not __GNUC__ or HAVE_ALLOCA_H */
353 #ifndef _AIX /* Already did AIX, up at the top. */
355 #endif /* not _AIX */
356 #endif /* HAVE_ALLOCA_H */
357 #endif /* __GNUC__ */
359 #endif /* not alloca */
361 #define REGEX_ALLOCATE alloca
363 /* Assumes a `char *destination' variable. */
364 #define REGEX_REALLOCATE(source, osize, nsize) \
365 (destination = (char *) alloca (nsize), \
366 memmove (destination, source, osize), \
369 /* No need to do anything to free, after alloca. */
370 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
372 #endif /* REGEX_MALLOC */
374 /* Define how to allocate the failure stack. */
378 #define REGEX_ALLOCATE_STACK(size) \
379 r_alloc ((char **) &failure_stack_ptr, (size))
380 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
381 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
382 #define REGEX_FREE_STACK(ptr) \
383 r_alloc_free ((void **) &failure_stack_ptr)
385 #else /* !REL_ALLOC */
389 #define REGEX_ALLOCATE_STACK xmalloc
390 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc(source, nsize)
391 #define REGEX_FREE_STACK xfree
393 #else /* !REGEX_MALLOC */
395 #define REGEX_ALLOCATE_STACK alloca
397 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
398 REGEX_REALLOCATE (source, osize, nsize)
399 /* No need to explicitly free anything. */
400 #define REGEX_FREE_STACK(arg)
402 #endif /* REGEX_MALLOC */
403 #endif /* REL_ALLOC */
405 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
406 `string1' or just past its end. This works if PTR is NULL, which is
408 #define FIRST_STRING_P(ptr) \
409 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
411 /* (Re)Allocate N items of type T using malloc, or fail. */
412 #define TALLOC(n, t) \
413 ((t *)xmalloc((n) * sizeof (t)))
414 #define TALLOC_ATOMIC(n, t) \
415 ((t *)xmalloc_atomic((n) * sizeof (t)))
416 #define RETALLOC(addr, n, t) \
417 ((addr) = (t *)xrealloc(addr, (n) * sizeof (t)))
418 #define RETALLOC_IF(addr, n, t) \
420 RETALLOC((addr), (n), t); \
422 (addr) = TALLOC ((n), t)
423 #define REGEX_TALLOC(n, t) \
424 ((t *)REGEX_ALLOCATE((n) * sizeof (t)))
426 #define BYTEWIDTH 8 /* In bits. */
428 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
432 #define MAX(a, b) ((a) > (b) ? (a) : (b))
433 #define MIN(a, b) ((a) < (b) ? (a) : (b))
435 /* Type of source-pattern and string chars. */
436 typedef const unsigned char re_char;
438 typedef char re_bool;
442 /* These are the command codes that appear in compiled regular
443 expressions. Some opcodes are followed by argument bytes. A
444 command code can specify any interpretation whatsoever for its
445 arguments. Zero bytes may appear in the compiled regular expression. */
450 /* Succeed right away--no more backtracking. */
453 /* Followed by one byte giving n, then by n literal bytes. */
456 /* Matches any (more or less) character. */
459 /* Matches any one char belonging to specified set. First
460 following byte is number of bitmap bytes. Then come bytes
461 for a bitmap saying which chars are in. Bits in each byte
462 are ordered low-bit-first. A character is in the set if its
463 bit is 1. A character too large to have a bit in the map is
464 automatically not in the set. */
467 /* Same parameters as charset, but match any character that is
468 not one of those specified. */
471 /* Start remembering the text that is matched, for storing in a
472 register. Followed by one byte with the register number, in
473 the range 1 to the pattern buffer's re_ngroups
474 field. Then followed by one byte with the number of groups
475 inner to this one. (This last has to be part of the
476 start_memory only because we need it in the on_failure_jump
480 /* Stop remembering the text that is matched and store it in a
481 memory register. Followed by one byte with the register
482 number, in the range 1 to `re_ngroups' in the
483 pattern buffer, and one byte with the number of inner groups,
484 just like `start_memory'. (We need the number of inner
485 groups here because we don't have any easy way of finding the
486 corresponding start_memory when we're at a stop_memory.) */
489 /* Match a duplicate of something remembered. Followed by one
490 byte containing the register number. */
493 /* Fail unless at beginning of line. */
496 /* Fail unless at end of line. */
499 /* Succeeds if at beginning of buffer (if emacs) or at beginning
500 of string to be matched (if not). */
503 /* Analogously, for end of buffer/string. */
506 /* Followed by two byte relative address to which to jump. */
509 /* Same as jump, but marks the end of an alternative. */
512 /* Followed by two-byte relative address of place to resume at
513 in case of failure. */
516 /* Like on_failure_jump, but pushes a placeholder instead of the
517 current string position when executed. */
518 on_failure_keep_string_jump,
520 /* Throw away latest failure point and then jump to following
521 two-byte relative address. */
524 /* Change to pop_failure_jump if know won't have to backtrack to
525 match; otherwise change to jump. This is used to jump
526 back to the beginning of a repeat. If what follows this jump
527 clearly won't match what the repeat does, such that we can be
528 sure that there is no use backtracking out of repetitions
529 already matched, then we change it to a pop_failure_jump.
530 Followed by two-byte address. */
533 /* Jump to following two-byte address, and push a dummy failure
534 point. This failure point will be thrown away if an attempt
535 is made to use it for a failure. A `+' construct makes this
536 before the first repeat. Also used as an intermediary kind
537 of jump when compiling an alternative. */
540 /* Push a dummy failure point and continue. Used at the end of
544 /* Followed by two-byte relative address and two-byte number n.
545 After matching N times, jump to the address upon failure. */
548 /* Followed by two-byte relative address, and two-byte number n.
549 Jump to the address N times, then fail. */
552 /* Set the following two-byte relative address to the
553 subsequent two-byte number. The address *includes* the two
557 wordchar, /* Matches any word-constituent character. */
558 notwordchar, /* Matches any char that is not a word-constituent. */
560 wordbeg, /* Succeeds if at word beginning. */
561 wordend, /* Succeeds if at word end. */
563 wordbound, /* Succeeds if at a word boundary. */
564 notwordbound /* Succeeds if not at a word boundary. */
566 , before_dot, /* Succeeds if before point. */
567 at_dot, /* Succeeds if at point. */
568 after_dot, /* Succeeds if after point. */
570 /* Matches any character whose syntax is specified. Followed by
571 a byte which contains a syntax code, e.g., Sword. */
574 /* Matches any character whose syntax is not that specified. */
578 /* need extra stuff to be able to properly work with XEmacs/Mule
579 characters (which may take up more than one byte) */
580 , charset_mule, /* Matches any character belonging to specified set.
581 The set is stored in "unified range-table
582 format"; see rangetab.c. Unlike the `charset'
583 opcode, this can handle arbitrary characters. */
585 charset_mule_not /* Same parameters as charset_mule, but match any
586 character that is not one of those specified. */
587 /* 97/2/17 jhod: The following two were merged back in from the Mule
588 2.3 code to enable some language specific processing */
589 , categoryspec, /* Matches entries in the character category tables */
590 notcategoryspec /* The opposite of the above */
594 /* Common operations on the compiled pattern. */
596 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
598 #define STORE_NUMBER(destination, number) \
600 (destination)[0] = (number) & 0377; \
601 (destination)[1] = (number) >> 8; \
604 /* Same as STORE_NUMBER, except increment DESTINATION to
605 the byte after where the number is stored. Therefore, DESTINATION
606 must be an lvalue. */
608 #define STORE_NUMBER_AND_INCR(destination, number) \
610 STORE_NUMBER (destination, number); \
611 (destination) += 2; \
614 /* Put into DESTINATION a number stored in two contiguous bytes starting
617 #define EXTRACT_NUMBER(destination, source) \
619 (destination) = *(source) & 0377; \
620 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
623 #ifdef REGEX_DEBUG_FLAG
624 static void extract_number(int *dest, re_char * source)
626 int temp = SIGN_EXTEND_CHAR(*(source + 1));
627 *dest = *source & 0377;
631 #ifndef EXTRACT_MACROS /* To debug the macros. */
632 #undef EXTRACT_NUMBER
633 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
634 #endif /* not EXTRACT_MACROS */
636 #endif /* REGEX_DEBUG_FLAG */
638 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
639 SOURCE must be an lvalue. */
641 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
643 EXTRACT_NUMBER (destination, source); \
647 #ifdef REGEX_DEBUG_FLAG
649 extract_number_and_incr(int *destination, unsigned char **source)
651 extract_number(destination, *source);
655 #ifndef EXTRACT_MACROS
656 #undef EXTRACT_NUMBER_AND_INCR
657 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
658 extract_number_and_incr (&dest, &src)
659 #endif /* not EXTRACT_MACROS */
661 #endif /* REGEX_DEBUG_FLAG */
663 /* If REGEX_DEBUG_FLAG is defined, Regex prints many voluminous messages about
664 what it is doing (if the variable `debug' is nonzero). If linked with the
665 main program in `iregex.c', you can enter patterns and strings interactively.
666 And if linked with the main program in `main.c' and the other test files, you
667 can run the already-written tests. */
669 #if defined (REGEX_DEBUG_FLAG)
671 /* We use standard I/O for debugging. */
675 /* XEmacs provides its own version of assert() */
676 /* It is useful to test things that ``must'' be true when debugging. */
681 /* no point in having this one, since we do not offer any accessor to this */
682 static int debug = 0;
685 #define DEBUG_STATEMENT(e) e
686 #define DEBUG_PRINT1(x) printf(x)
687 #define DEBUG_PRINT2(x1, x2) printf(x1, x2)
688 #define DEBUG_PRINT3(x1, x2, x3) printf(x1, x2, x3)
689 #define DEBUG_PRINT4(x1, x2, x3, x4) printf(x1, x2, x3, x4)
690 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
691 print_partial_compiled_pattern(s, e)
692 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
693 print_double_string (w, s1, sz1, s2, sz2)
695 /* Print the fastmap in human-readable form. */
697 static void print_fastmap(char *fastmap)
699 unsigned was_a_range = 0;
702 while (i < (1 << BYTEWIDTH)) {
706 while (i < (1 << BYTEWIDTH) && fastmap[i]) {
719 /* Print a compiled pattern string in human-readable form, starting at
720 the START pointer into it and ending just before the pointer END. */
722 /* can't help it ... the ugly mule code changes `start' upon alignment,
723 * so this can't have the const qualifier ... bugger */
725 print_partial_compiled_pattern(unsigned char *start, re_char *end)
728 unsigned char *p = start;
736 /* Loop over pattern commands. */
738 printf("%ld:\t", (long)(p - start));
740 switch ((const re_opcode_t)*p++) {
747 printf("/exactn/%d", mcnt);
757 printf("/start_memory/%d/%d", mcnt, *p++);
762 printf("/stop_memory/%d/%d", mcnt, *p++);
766 printf("/duplicate/%d", *p++);
775 REGISTER int c, last = -100;
776 REGISTER int in_range = 0;
778 printf("/charset [%s",
779 (re_opcode_t) * (p - 1) == charset_not
782 assert(p + *p < pend);
784 for (c = 0; c < 256; c++)
785 if (((unsigned char)(c / 8) < *p)
788 /* Are we starting a range? */
789 if (last + 1 == c && !in_range) {
793 /* Have we broken a range? */
794 else if (last + 1 != c
817 case charset_mule_not: {
820 printf("/charset_mule [%s",
821 (re_opcode_t) * (p - 1) == charset_mule_not
823 /* u_r_t_nentries() CHANGES its arg, why on earth
824 * is this marked const here then? :O -hrop */
825 nentries = unified_range_table_nentries(p);
826 for (i = 0; i < nentries; i++) {
827 EMACS_INT first, last;
828 Lisp_Object dummy_val;
830 /* u_r_t_get_range CHANGES its arg ...
831 * p can't be const thence */
832 unified_range_table_get_range(p, i,
839 printf("(0x%lx)", (long)first);
850 p += unified_range_table_bytes_used(p);
863 case on_failure_jump:
864 extract_number_and_incr(&mcnt, &p);
865 printf("/on_failure_jump to %ld",
866 (long)(p + mcnt - start));
869 case on_failure_keep_string_jump:
870 extract_number_and_incr(&mcnt, &p);
871 printf("/on_failure_keep_string_jump to %ld",
872 (long)(p + mcnt - start));
875 case dummy_failure_jump:
876 extract_number_and_incr(&mcnt, &p);
877 printf("/dummy_failure_jump to %ld",
878 (long)(p + mcnt - start));
881 case push_dummy_failure:
882 printf("/push_dummy_failure");
886 extract_number_and_incr(&mcnt, &p);
887 printf("/maybe_pop_jump to %ld",
888 (long)(p + mcnt - start));
891 case pop_failure_jump:
892 extract_number_and_incr(&mcnt, &p);
893 printf("/pop_failure_jump to %ld",
894 (long)(p + mcnt - start));
898 extract_number_and_incr(&mcnt, &p);
899 printf("/jump_past_alt to %ld",
900 (long)(p + mcnt - start));
904 extract_number_and_incr(&mcnt, &p);
905 printf("/jump to %ld", (long)(p + mcnt - start));
909 extract_number_and_incr(&mcnt, &p);
910 extract_number_and_incr(&mcnt2, &p);
911 printf("/succeed_n to %ld, %d times",
912 (long)(p + mcnt - start), mcnt2);
916 extract_number_and_incr(&mcnt, &p);
917 extract_number_and_incr(&mcnt2, &p);
918 printf("/jump_n to %ld, %d times",
919 (long)(p + mcnt - start), mcnt2);
923 extract_number_and_incr(&mcnt, &p);
924 extract_number_and_incr(&mcnt2, &p);
925 printf("/set_number_at location %ld to %d",
926 (long)(p + mcnt - start), mcnt2);
930 printf("/wordbound");
934 printf("/notwordbound");
946 printf("/before_dot");
954 printf("/after_dot");
958 printf("/syntaxspec");
964 printf("/notsyntaxspec");
970 /* 97/2/17 jhod Mule category patch */
972 printf("/categoryspec");
977 case notcategoryspec:
978 printf("/notcategoryspec");
982 /* end of category patch */
991 printf("/notwordchar");
1004 printf("?%d", *(p - 1));
1010 printf("%ld:\tend of pattern.\n", (long)(p - start));
1014 print_compiled_pattern(struct re_pattern_buffer *bufp)
1018 re_char *buffer = bufp->buffer;
1020 unsigned char *buffer = bufp->buffer;
1023 print_partial_compiled_pattern(buffer, buffer + bufp->used);
1024 printf("%ld bytes used/%ld bytes allocated.\n", bufp->used,
1027 if (bufp->fastmap_accurate && bufp->fastmap) {
1028 printf("fastmap: ");
1029 print_fastmap(bufp->fastmap);
1032 printf("re_nsub: %ld\t", (long)bufp->re_nsub);
1033 printf("re_ngroups: %ld\t", (long)bufp->re_ngroups);
1034 printf("regs_alloc: %d\t", bufp->regs_allocated);
1035 printf("can_be_null: %d\t", bufp->can_be_null);
1036 printf("newline_anchor: %d\n", bufp->newline_anchor);
1037 printf("no_sub: %d\t", bufp->no_sub);
1038 printf("not_bol: %d\t", bufp->not_bol);
1039 printf("not_eol: %d\t", bufp->not_eol);
1040 printf("syntax: %d\n", bufp->syntax);
1041 /* Perhaps we should print the translate table? */
1042 /* and maybe the category table? */
1044 if (bufp->external_to_internal_register) {
1047 printf("external_to_internal_register:\n");
1048 for (i = 0; i <= bufp->re_nsub; i++) {
1051 printf("%d -> %d", i,
1052 bufp->external_to_internal_register[i]);
1059 print_double_string(re_char * where, re_char * string1, int size1,
1060 re_char * string2, int size2)
1065 Element_count this_char;
1067 if (FIRST_STRING_P(where)) {
1068 for (this_char = where - string1; this_char < size1;
1070 putchar(string1[this_char]);
1075 for (this_char = where - string2; this_char < size2;
1077 putchar(string2[this_char]);
1081 #else /* not REGEX_DEBUG_FLAG */
1086 #define DEBUG_STATEMENT(e)
1087 #define DEBUG_PRINT1(x)
1088 #define DEBUG_PRINT2(x1, x2)
1089 #define DEBUG_PRINT3(x1, x2, x3)
1090 #define DEBUG_PRINT4(x1, x2, x3, x4)
1091 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1092 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1094 #endif /* REGEX_DEBUG_FLAG */
1096 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1097 also be assigned to arbitrarily: each pattern buffer stores its own
1098 syntax, so it can be changed between regex compilations. */
1099 /* This has no initializer because initialized variables in Emacs
1100 become read-only after dumping. */
1101 reg_syntax_t re_syntax_options;
1103 /* Specify the precise syntax of regexps for compilation. This provides
1104 for compatibility for various utilities which historically have
1105 different, incompatible syntaxes.
1107 The argument SYNTAX is a bit mask comprised of the various bits
1108 defined in regex.h. We return the old syntax. */
1110 reg_syntax_t re_set_syntax(reg_syntax_t syntax)
1112 reg_syntax_t ret = re_syntax_options;
1114 re_syntax_options = syntax;
1118 /* This table gives an error message for each of the error codes listed
1119 in regex.h. Obviously the order here has to be same as there.
1120 POSIX doesn't require that we do anything for REG_NOERROR,
1121 but why not be nice? */
1123 static const char *re_error_msgid[] = {
1124 "Success", /* REG_NOERROR */
1125 "No match", /* REG_NOMATCH */
1126 "Invalid regular expression", /* REG_BADPAT */
1127 "Invalid collation character", /* REG_ECOLLATE */
1128 "Invalid character class name", /* REG_ECTYPE */
1129 "Trailing backslash", /* REG_EESCAPE */
1130 "Invalid back reference", /* REG_ESUBREG */
1131 "Unmatched [ or [^", /* REG_EBRACK */
1132 "Unmatched ( or \\(", /* REG_EPAREN */
1133 "Unmatched \\{", /* REG_EBRACE */
1134 "Invalid content of \\{\\}", /* REG_BADBR */
1135 "Invalid range end", /* REG_ERANGE */
1136 "Memory exhausted", /* REG_ESPACE */
1137 "Invalid preceding regular expression", /* REG_BADRPT */
1138 "Premature end of regular expression", /* REG_EEND */
1139 "Regular expression too big", /* REG_ESIZE */
1140 "Unmatched ) or \\)", /* REG_ERPAREN */
1142 "Invalid syntax designator", /* REG_ESYNTAX */
1145 "Ranges may not span charsets", /* REG_ERANGESPAN */
1146 "Invalid category designator", /* REG_ECATEGORY */
1150 /* Avoiding alloca during matching, to placate r_alloc. */
1152 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1153 searching and matching functions should not call alloca. On some
1154 systems, alloca is implemented in terms of malloc, and if we're
1155 using the relocating allocator routines, then malloc could cause a
1156 relocation, which might (if the strings being searched are in the
1157 ralloc heap) shift the data out from underneath the regexp
1160 Here's another reason to avoid allocation: Emacs
1161 processes input from X in a signal handler; processing X input may
1162 call malloc; if input arrives while a matching routine is calling
1163 malloc, then we're scrod. But Emacs can't just block input while
1164 calling matching routines; then we don't notice interrupts when
1165 they come in. So, Emacs blocks input around all regexp calls
1166 except the matching calls, which it leaves unprotected, in the
1167 faith that they will not malloc. */
1169 /* Normally, this is fine. */
1170 #define MATCH_MAY_ALLOCATE
1172 /* When using GNU C, we are not REALLY using the C alloca, no matter
1173 what config.h may say. So don't take precautions for it. */
1178 /* The match routines may not allocate if (1) they would do it with malloc
1179 and (2) it's not safe for them to use malloc.
1180 Note that if REL_ALLOC is defined, matching would not use malloc for the
1181 failure stack, but we would still use it for the register vectors;
1182 so REL_ALLOC should not affect this. */
1183 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1184 #undef MATCH_MAY_ALLOCATE
1187 /* Failure stack declarations and macros; both re_compile_fastmap and
1188 re_match_2 use a failure stack. These have to be macros because of
1189 REGEX_ALLOCATE_STACK. */
1191 /* Number of failure points for which to initially allocate space
1192 when matching. If this number is exceeded, we allocate more
1193 space, so it is not a hard limit. */
1194 #ifndef INIT_FAILURE_ALLOC
1195 #define INIT_FAILURE_ALLOC 20
1198 /* Roughly the maximum number of failure points on the stack. Would be
1199 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1200 This is a variable only so users of regex can assign to it; we never
1201 change it ourselves. */
1202 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1203 /* 4400 was enough to cause a crash on Alpha OSF/1,
1204 whose default stack limit is 2mb. */
1205 int re_max_failures = 50000;
1207 int re_max_failures = 2000;
1210 union fail_stack_elt {
1211 const unsigned char *pointer;
1215 typedef union fail_stack_elt fail_stack_elt_t;
1218 fail_stack_elt_t *stack;
1220 Element_count avail; /* Offset of next open position. */
1223 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1224 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1225 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1227 /* Define macros to initialize and free the failure stack.
1228 Do `return -2' if the alloc fails. */
1230 #ifdef MATCH_MAY_ALLOCATE
1231 #define INIT_FAIL_STACK() \
1233 fail_stack.stack = (fail_stack_elt_t *) \
1234 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \
1235 sizeof (fail_stack_elt_t)); \
1237 if (fail_stack.stack == NULL) \
1240 fail_stack.size = INIT_FAILURE_ALLOC; \
1241 fail_stack.avail = 0; \
1244 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1246 #define INIT_FAIL_STACK() \
1248 fail_stack.avail = 0; \
1251 #define RESET_FAIL_STACK()
1254 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1256 Return 1 if succeeds, and 0 if either ran out of memory
1257 allocating space for it or it was already too large.
1259 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1261 #define DOUBLE_FAIL_STACK(fail_stack) \
1262 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1264 : ((fail_stack).stack = (fail_stack_elt_t *) \
1265 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1266 (fail_stack).size * sizeof (fail_stack_elt_t), \
1267 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1269 (fail_stack).stack == NULL \
1271 : ((fail_stack).size <<= 1, \
1274 /* Push pointer POINTER on FAIL_STACK.
1275 Return 1 if was able to do so and 0 if ran out of memory allocating
1277 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1278 ((FAIL_STACK_FULL () \
1279 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1281 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1284 /* Push a pointer value onto the failure stack.
1285 Assumes the variable `fail_stack'. Probably should only
1286 be called from within `PUSH_FAILURE_POINT'. */
1287 #define PUSH_FAILURE_POINTER(item) \
1288 fail_stack.stack[fail_stack.avail++].pointer = \
1289 (const unsigned char*)(item)
1291 /* This pushes an integer-valued item onto the failure stack.
1292 Assumes the variable `fail_stack'. Probably should only
1293 be called from within `PUSH_FAILURE_POINT'. */
1294 #define PUSH_FAILURE_INT(item) \
1295 fail_stack.stack[fail_stack.avail++].integer = (item)
1297 /* Push a fail_stack_elt_t value onto the failure stack.
1298 Assumes the variable `fail_stack'. Probably should only
1299 be called from within `PUSH_FAILURE_POINT'. */
1300 #define PUSH_FAILURE_ELT(item) \
1301 fail_stack.stack[fail_stack.avail++] = (item)
1303 /* These three POP... operations complement the three PUSH... operations.
1304 All assume that `fail_stack' is nonempty. */
1305 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1306 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1307 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1309 /* Used to omit pushing failure point id's when we're not debugging. */
1310 #ifdef REGEX_DEBUG_FLAG
1311 #define DEBUG_PUSH PUSH_FAILURE_INT
1312 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1314 #define DEBUG_PUSH(item)
1315 #define DEBUG_POP(item_addr)
1318 /* Push the information about the state we will need
1319 if we ever fail back to it.
1321 Requires variables fail_stack, regstart, regend, reg_info, and
1322 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1325 Does `return FAILURE_CODE' if runs out of memory. */
1327 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1328 #define DECLARE_DESTINATION char *destination
1330 #define DECLARE_DESTINATION DECLARE_NOTHING
1333 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1335 DECLARE_DESTINATION; \
1336 /* Must be int, so when we don't save any registers, the\
1337 arithmetic \ of 0 + -1 isn't done as unsigned. */ \
1340 DEBUG_STATEMENT (failure_id++); \
1341 DEBUG_STATEMENT (nfailure_points_pushed++); \
1342 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1343 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1344 (long unsigned int)(fail_stack).avail); \
1345 DEBUG_PRINT2 (" size: %lu\n", \
1346 (long unsigned int)(fail_stack).size); \
1348 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1349 DEBUG_PRINT2 (" available: %ld\n", \
1350 (long) REMAINING_AVAIL_SLOTS); \
1352 /* Ensure we have enough space allocated for what \
1353 * we will push. */ \
1354 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) { \
1355 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1356 return failure_code; \
1358 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1359 (unsigned long) (fail_stack).size); \
1360 DEBUG_PRINT2 (" slots available: %ld\n", \
1361 (long) REMAINING_AVAIL_SLOTS); \
1364 /* Push the info, starting with the registers. */ \
1365 DEBUG_PRINT1 ("\n"); \
1367 for (this_reg = lowest_active_reg; \
1368 this_reg <= highest_active_reg; \
1370 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1371 DEBUG_STATEMENT (num_regs_pushed++); \
1373 DEBUG_PRINT2(" start: %p\n", \
1374 regstart[this_reg]); \
1375 PUSH_FAILURE_POINTER(regstart[this_reg]); \
1377 DEBUG_PRINT2 (" end: %p\n", \
1378 regend[this_reg]); \
1379 PUSH_FAILURE_POINTER(regend[this_reg]); \
1381 DEBUG_PRINT2 (" info: 0x%p\n ", \
1382 ®_info[this_reg]); \
1383 DEBUG_PRINT2 (" match_null=%d", \
1384 REG_MATCH_NULL_STRING_P \
1385 (reg_info[this_reg])); \
1386 DEBUG_PRINT2 (" active=%d", \
1387 IS_ACTIVE (reg_info[this_reg])); \
1388 DEBUG_PRINT2 (" matched_something=%d", \
1390 (reg_info[this_reg])); \
1391 DEBUG_PRINT2 (" ever_matched_something=%d", \
1392 EVER_MATCHED_SOMETHING \
1393 (reg_info[this_reg])); \
1394 DEBUG_PRINT1 ("\n"); \
1395 PUSH_FAILURE_ELT(reg_info[this_reg].word); \
1398 DEBUG_PRINT2 (" Pushing low active reg: %d\n", \
1399 lowest_active_reg); \
1400 PUSH_FAILURE_INT (lowest_active_reg); \
1402 DEBUG_PRINT2 (" Pushing high active reg: %d\n", \
1403 highest_active_reg); \
1404 PUSH_FAILURE_INT (highest_active_reg); \
1406 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", \
1407 (long) pattern_place); \
1408 DEBUG_PRINT_COMPILED_PATTERN(bufp, pattern_place, pend); \
1409 PUSH_FAILURE_POINTER (pattern_place); \
1411 DEBUG_PRINT2(" Pushing string %p: `", \
1413 DEBUG_PRINT_DOUBLE_STRING(string_place, \
1416 DEBUG_PRINT1("'\n"); \
1417 PUSH_FAILURE_POINTER (string_place); \
1419 DEBUG_PRINT2(" Pushing failure id: %u\n", failure_id); \
1420 DEBUG_PUSH(failure_id); \
1423 /* This is the number of items that are pushed and popped on the stack
1424 for each register. */
1425 #define NUM_REG_ITEMS 3
1427 /* Individual items aside from the registers. */
1429 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1431 #define NUM_NONREG_ITEMS 4
1434 /* We push at most this many items on the stack. */
1435 /* We used to use (num_regs - 1), which is the number of registers
1436 this regexp will save; but that was changed to 5
1437 to avoid stack overflow for a regexp with lots of parens. */
1438 #define MAX_FAILURE_ITEMS (5 * (NUM_REG_ITEMS + NUM_NONREG_ITEMS))
1440 /* We actually push this many items. */
1441 #define NUM_FAILURE_ITEMS \
1442 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1445 /* How many items can still be added to the stack without overflowing it. */
1446 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1448 /* Pops what PUSH_FAIL_STACK pushes.
1450 We restore into the parameters, all of which should be lvalues:
1451 STR -- the saved data position.
1452 PAT -- the saved pattern position.
1453 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1454 REGSTART, REGEND -- arrays of string positions.
1455 REG_INFO -- array of information about each subexpression.
1457 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1458 `pend', `string1', `size1', `string2', and `size2'. */
1460 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1461 regstart, regend, reg_info) \
1464 const unsigned char *string_temp; \
1465 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id); \
1467 assert (!FAIL_STACK_EMPTY ()); \
1469 /* Remove failure points and point to \
1470 * how many regs pushed. */ \
1471 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1472 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1473 (unsigned long) fail_stack.avail); \
1474 DEBUG_PRINT2 (" size: %lu\n", \
1475 (unsigned long) fail_stack.size); \
1477 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1479 DEBUG_POP (&ffailure_id.integer); \
1480 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1481 * (unsigned int *) &ffailure_id); \
1483 /* If the saved string location is NULL, it \
1484 came from an on_failure_keep_string_jump opcode, \
1485 and we want to throw away the saved NULL, thus \
1486 retaining our current position in the string. */ \
1487 string_temp = (const unsigned char *) \
1488 POP_FAILURE_POINTER (); \
1489 if (string_temp != NULL) \
1490 str = string_temp; \
1492 DEBUG_PRINT2(" Popping string %p: `", str); \
1493 DEBUG_PRINT_DOUBLE_STRING(str, \
1496 DEBUG_PRINT1("'\n"); \
1498 pat = (unsigned char *) POP_FAILURE_POINTER(); \
1499 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1500 DEBUG_PRINT_COMPILED_PATTERN(bufp, pat, pend); \
1502 /* Restore register info. */ \
1503 high_reg = (unsigned) POP_FAILURE_INT (); \
1504 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1506 low_reg = (unsigned) POP_FAILURE_INT (); \
1507 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1509 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) { \
1510 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1512 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1513 DEBUG_PRINT2 (" info: %p\n", \
1514 ®_info[this_reg]); \
1516 regend[this_reg] = POP_FAILURE_POINTER (); \
1517 DEBUG_PRINT2 (" end: %p\n", \
1518 regend[this_reg]); \
1520 regstart[this_reg] = POP_FAILURE_POINTER (); \
1521 DEBUG_PRINT2 (" start: %p\n", \
1522 regstart[this_reg]); \
1525 set_regs_matched_done = 0; \
1526 DEBUG_STATEMENT (nfailure_points_popped++); \
1527 } while (0) /* POP_FAILURE_POINT */
1529 /* Structure for per-register (a.k.a. per-group) information.
1530 Other register information, such as the
1531 starting and ending positions (which are addresses), and the list of
1532 inner groups (which is a bits list) are maintained in separate
1535 We are making a (strictly speaking) nonportable assumption here: that
1536 the compiler will pack our bit fields into something that fits into
1537 the type of `word', i.e., is something that fits into one item on the
1541 fail_stack_elt_t word;
1543 /* This field is one if this group can match the empty string,
1544 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1545 #define MATCH_NULL_UNSET_VALUE 3
1546 unsigned match_null_string_p:2;
1547 unsigned is_active:1;
1548 unsigned matched_something:1;
1549 unsigned ever_matched_something:1;
1551 } register_info_type;
1553 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1554 #define IS_ACTIVE(R) ((R).bits.is_active)
1555 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1556 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1558 /* Call this when have matched a real character; it sets `matched' flags
1559 for the subexpressions which we are currently inside. Also records
1560 that those subexprs have matched. */
1561 #define SET_REGS_MATCHED() \
1564 if (!set_regs_matched_done) \
1567 set_regs_matched_done = 1; \
1568 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1570 MATCHED_SOMETHING (reg_info[r]) \
1571 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1578 /* Registers are set to a sentinel when they haven't yet matched. */
1579 static unsigned char reg_unset_dummy;
1580 #define REG_UNSET_VALUE (®_unset_dummy)
1581 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1583 /* Subroutine declarations and macros for regex_compile. */
1585 /* Fetch the next character in the uncompiled pattern---translating it
1586 if necessary. Also cast from a signed character in the constant
1587 string passed to us by the user to an unsigned char that we can use
1588 as an array index (in, e.g., `translate'). */
1589 #define PATFETCH(c) \
1592 c = TRANSLATE (c); \
1595 /* Fetch the next character in the uncompiled pattern, with no
1597 #define PATFETCH_RAW(c) \
1598 do {if (p == pend) return REG_EEND; \
1599 assert (p < pend); \
1600 c = charptr_emchar (p); \
1604 /* Go backwards one character in the pattern. */
1605 #define PATUNFETCH DEC_CHARPTR (p)
1609 #define PATFETCH_EXTENDED(emch) \
1610 do {if (p == pend) return REG_EEND; \
1611 assert (p < pend); \
1612 emch = charptr_emchar ((const Bufbyte *) p); \
1614 if (TRANSLATE_P (translate) && emch < 0x80) \
1615 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1618 #define PATFETCH_RAW_EXTENDED(emch) \
1619 do {if (p == pend) return REG_EEND; \
1620 assert (p < pend); \
1621 emch = charptr_emchar ((const Bufbyte *) p); \
1625 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1627 #define PATFETCH_EITHER(emch) \
1629 if (has_extended_chars) \
1630 PATFETCH_EXTENDED (emch); \
1635 #define PATFETCH_RAW_EITHER(emch) \
1637 if (has_extended_chars) \
1638 PATFETCH_RAW_EXTENDED (emch); \
1640 PATFETCH_RAW (emch); \
1643 #define PATUNFETCH_EITHER \
1645 if (has_extended_chars) \
1646 PATUNFETCH_EXTENDED (emch); \
1648 PATUNFETCH (emch); \
1651 #else /* not MULE */
1653 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1654 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1655 #define PATUNFETCH_EITHER PATUNFETCH
1659 /* If `translate' is non-null, return translate[D], else just D. We
1660 cast the subscript to translate because some data is declared as
1661 `char *', to avoid warnings when a string constant is passed. But
1662 when we use a character as a subscript we must make it unsigned. */
1663 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1665 /* Macros for outputting the compiled pattern into `buffer'. */
1667 /* If the buffer isn't allocated when it comes in, use this. */
1668 #define INIT_BUF_SIZE 32
1670 /* Make sure we have at least N more bytes of space in buffer. */
1671 #define GET_BUFFER_SPACE(n) \
1672 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1675 /* Make sure we have one more byte of buffer space and then add C to it. */
1676 #define BUF_PUSH(c) \
1678 GET_BUFFER_SPACE (1); \
1679 *buf_end++ = (unsigned char) (c); \
1682 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1683 #define BUF_PUSH_2(c1, c2) \
1685 GET_BUFFER_SPACE (2); \
1686 *buf_end++ = (unsigned char) (c1); \
1687 *buf_end++ = (unsigned char) (c2); \
1690 /* As with BUF_PUSH_2, except for three bytes. */
1691 #define BUF_PUSH_3(c1, c2, c3) \
1693 GET_BUFFER_SPACE (3); \
1694 *buf_end++ = (unsigned char) (c1); \
1695 *buf_end++ = (unsigned char) (c2); \
1696 *buf_end++ = (unsigned char) (c3); \
1699 /* Store a jump with opcode OP at LOC to location TO. We store a
1700 relative address offset by the three bytes the jump itself occupies. */
1701 #define STORE_JUMP(op, loc, to) \
1702 store_op1 (op, loc, (to) - (loc) - 3)
1704 /* Likewise, for a two-argument jump. */
1705 #define STORE_JUMP2(op, loc, to, arg) \
1706 store_op2 (op, loc, (to) - (loc) - 3, arg)
1708 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1710 #define INSERT_JUMP(op, loc, to) \
1711 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1713 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1715 #define INSERT_JUMP2(op, loc, to, arg) \
1716 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1718 /* This is not an arbitrary limit: the arguments which represent offsets
1719 into the pattern are two bytes long. So if 2^16 bytes turns out to
1720 be too small, many things would have to change. */
1721 #define MAX_BUF_SIZE (1L << 16)
1723 /* Extend the buffer by twice its current size via realloc and
1724 reset the pointers that pointed into the old block to point to the
1725 correct places in the new one. If extending the buffer results in it
1726 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1727 #define EXTEND_BUFFER() \
1729 re_char *old_buffer = bufp->buffer; \
1730 if (bufp->allocated == MAX_BUF_SIZE) \
1732 bufp->allocated <<= 1; \
1733 if (bufp->allocated > MAX_BUF_SIZE) \
1734 bufp->allocated = MAX_BUF_SIZE; \
1735 bufp->buffer = (unsigned char *)xrealloc( \
1736 bufp->buffer, bufp->allocated); \
1737 if (bufp->buffer == NULL) { \
1738 return REG_ESPACE; \
1740 /* If the buffer moved, move all the pointers into it. */ \
1741 if (old_buffer != bufp->buffer) { \
1742 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1743 begalt = (begalt - old_buffer) + bufp->buffer; \
1744 if (fixup_alt_jump) { \
1746 (fixup_alt_jump - old_buffer) + \
1750 laststart = (laststart - old_buffer) + \
1753 if (pending_exact) { \
1755 (pending_exact - old_buffer) + \
1761 /* Since we have one byte reserved for the register number argument to
1762 {start,stop}_memory, the maximum number of groups we can report
1763 things about is what fits in that byte. */
1764 #define MAX_REGNUM 255
1766 /* But patterns can have more than `MAX_REGNUM' registers. We just
1767 ignore the excess. */
1768 typedef unsigned regnum_t;
1770 #define INIT_REG_TRANSLATE_SIZE 5
1772 /* Macros for the compile stack. */
1774 /* Since offsets can go either forwards or backwards, this type needs to
1775 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1776 typedef int pattern_offset_t;
1779 pattern_offset_t begalt_offset;
1780 pattern_offset_t fixup_alt_jump;
1781 pattern_offset_t inner_group_offset;
1782 pattern_offset_t laststart_offset;
1784 } compile_stack_elt_t;
1787 compile_stack_elt_t *stack;
1789 unsigned avail; /* Offset of next open position. */
1790 } compile_stack_type;
1792 #define INIT_COMPILE_STACK_SIZE 32
1794 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1795 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1797 /* The next available element. */
1798 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1800 /* Set the bit for character C in a bit vector. */
1801 #define SET_LIST_BIT(c) \
1802 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1803 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1807 /* Set the "bit" for character C in a range table. */
1808 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1810 /* Set the "bit" for character c in the appropriate table. */
1811 #define SET_EITHER_BIT(c) \
1813 if (has_extended_chars) \
1814 SET_RANGETAB_BIT (c); \
1819 #else /* not MULE */
1821 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1825 /* Get the next unsigned number in the uncompiled pattern. */
1826 #define GET_UNSIGNED_NUMBER(num) \
1830 while (ISDIGIT (c)) \
1834 num = num * 10 + c - '0'; \
1842 #define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
1845 static void store_op1(re_opcode_t op, unsigned char *loc, int arg);
1846 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1847 static void insert_op1(re_opcode_t op, unsigned char *loc, int arg,
1848 unsigned char *end);
1849 static void insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1850 unsigned char *end);
1851 static re_bool at_begline_loc_p(re_char*, re_char*, reg_syntax_t);
1852 static re_bool at_endline_loc_p(re_char*, re_char*, int syntax);
1853 static re_bool group_in_compile_stack(compile_stack_type compile_stack,
1855 static reg_errcode_t compile_range(re_char ** p_ptr, re_char * pend,
1856 RE_TRANSLATE_TYPE translate,
1857 reg_syntax_t syntax, unsigned char *b);
1859 static reg_errcode_t compile_extended_range(re_char ** p_ptr,
1861 RE_TRANSLATE_TYPE translate,
1862 reg_syntax_t syntax,
1865 static re_bool group_match_null_string_p(unsigned char **p,
1867 register_info_type * reg_info);
1868 static re_bool alt_match_null_string_p(unsigned char *p, unsigned char *end,
1869 register_info_type * reg_info);
1870 static re_bool common_op_match_null_string_p(unsigned char **p,
1872 register_info_type * reg_info);
1873 static int bcmp_translate(const unsigned char *s1, const unsigned char *s2,
1874 REGISTER int len, RE_TRANSLATE_TYPE translate);
1875 static int re_match_2_internal(struct re_pattern_buffer *bufp,
1876 re_char * string1, int size1,
1877 re_char * string2, int size2, int pos,
1878 struct re_registers *regs, int stop);
1880 #ifndef MATCH_MAY_ALLOCATE
1882 /* If we cannot allocate large objects within re_match_2_internal,
1883 we make the fail stack and register vectors global.
1884 The fail stack, we grow to the maximum size when a regexp
1886 The register vectors, we adjust in size each time we
1887 compile a regexp, according to the number of registers it needs. */
1889 static fail_stack_type fail_stack;
1891 /* Size with which the following vectors are currently allocated.
1892 That is so we can make them bigger as needed,
1893 but never make them smaller. */
1894 static int regs_allocated_size;
1896 static re_char **regstart, **regend;
1897 static re_char **old_regstart, **old_regend;
1898 static re_char **best_regstart, **best_regend;
1899 static register_info_type *reg_info;
1900 static re_char **reg_dummy;
1901 static register_info_type *reg_info_dummy;
1903 /* Make the register vectors big enough for NUM_REGS registers,
1904 but don't make them smaller. */
1906 static void regex_grow_registers(int num_regs)
1908 if (num_regs > regs_allocated_size) {
1909 RETALLOC_IF(regstart, num_regs, re_char *);
1910 RETALLOC_IF(regend, num_regs, re_char *);
1911 RETALLOC_IF(old_regstart, num_regs, re_char *);
1912 RETALLOC_IF(old_regend, num_regs, re_char *);
1913 RETALLOC_IF(best_regstart, num_regs, re_char *);
1914 RETALLOC_IF(best_regend, num_regs, re_char *);
1915 RETALLOC_IF(reg_info, num_regs, register_info_type);
1916 RETALLOC_IF(reg_dummy, num_regs, re_char *);
1917 RETALLOC_IF(reg_info_dummy, num_regs, register_info_type);
1919 regs_allocated_size = num_regs;
1923 #endif /* not MATCH_MAY_ALLOCATE */
1925 /* auxiliary stuff, FSF theft */
1926 /* Character classes. */
1929 RECC_ALNUM, RECC_ALPHA, RECC_WORD,
1930 RECC_GRAPH, RECC_PRINT,
1931 RECC_LOWER, RECC_UPPER,
1932 RECC_PUNCT, RECC_CNTRL,
1933 RECC_DIGIT, RECC_XDIGIT,
1934 RECC_BLANK, RECC_SPACE,
1935 RECC_MULTIBYTE, RECC_NONASCII,
1936 RECC_ASCII, RECC_UNIBYTE
1939 /* Map a string to the char class it names (if any). */
1940 static inline re_wctype_t
1941 re_wctype(re_char *str)
1943 const char *string = (const char*)str;
1945 if (STREQ (string, "alnum")) {
1947 } else if (STREQ (string, "alpha")) {
1949 } else if (STREQ (string, "digit")) {
1951 } else if (STREQ (string, "graph")) {
1953 } else if (STREQ (string, "space")) {
1955 } else if (STREQ (string, "upper")) {
1957 } else if (STREQ (string, "lower")) {
1959 } else if (STREQ (string, "word")) {
1961 } else if (STREQ (string, "print")) {
1963 } else if (STREQ (string, "punct")) {
1965 } else if (STREQ (string, "xdigit")) {
1967 } else if (STREQ (string, "cntrl")) {
1969 } else if (STREQ (string, "blank")) {
1971 } else if (STREQ (string, "ascii")) {
1973 } else if (STREQ (string, "nonascii")) {
1974 return RECC_NONASCII;
1975 } else if (STREQ (string, "unibyte")) {
1976 return RECC_UNIBYTE;
1977 } else if (STREQ (string, "multibyte")) {
1978 return RECC_MULTIBYTE;
1984 /* True if CH is in the char class CC. */
1986 re_iswctype(int ch, re_wctype_t cc)
1990 return ISALNUM (ch);
1992 return ISALPHA (ch);
1994 return ISBLANK (ch);
1996 return ISCNTRL (ch);
1998 return ISDIGIT (ch);
2000 return ISGRAPH (ch);
2002 return ISLOWER (ch);
2004 return ISPRINT (ch);
2006 return ISPUNCT (ch);
2008 return ISSPACE (ch);
2010 return ISUPPER (ch);
2012 return ISXDIGIT (ch);
2014 return IS_REAL_ASCII (ch);
2016 return !IS_REAL_ASCII (ch);
2020 return ISUNIBYTE((unsigned int)ch);
2021 case RECC_MULTIBYTE:
2022 return !ISUNIBYTE((unsigned int)ch);
2032 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2033 Returns one of error codes defined in `regex.h', or zero for success.
2035 Assumes the `allocated' (and perhaps `buffer') and `translate'
2036 fields are set in BUFP on entry.
2038 If it succeeds, results are put in BUFP (if it returns an error, the
2039 contents of BUFP are undefined):
2040 `buffer' is the compiled pattern;
2041 `syntax' is set to SYNTAX;
2042 `used' is set to the length of the compiled pattern;
2043 `fastmap_accurate' is zero;
2044 `re_ngroups' is the number of groups/subexpressions (including shy
2046 `re_nsub' is the number of non-shy groups in PATTERN;
2047 `not_bol' and `not_eol' are zero;
2049 The `fastmap' and `newline_anchor' fields are neither
2050 examined nor set. */
2052 static reg_errcode_t
2053 regex_compile(re_char * pattern, int size, reg_syntax_t syntax,
2054 struct re_pattern_buffer *bufp)
2056 /* We fetch characters from PATTERN here. We declare these as int
2057 (or possibly long) so that chars above 127 can be used as
2058 array indices. The macros that fetch a character from the pattern
2059 make sure to coerce to unsigned char before assigning, so we won't
2060 get bitten by negative numbers here. */
2061 /* XEmacs change: used to be unsigned char. */
2062 REGISTER EMACS_INT c, c1;
2064 /* A random temporary spot in PATTERN. */
2067 /* Points to the end of the buffer, where we should append. */
2068 REGISTER unsigned char *buf_end;
2070 /* Keeps track of unclosed groups. */
2071 compile_stack_type compile_stack;
2073 /* Points to the current (ending) position in the pattern. */
2074 re_char *p = pattern;
2075 re_char *pend = pattern + size;
2077 /* How to translate the characters in the pattern. */
2078 RE_TRANSLATE_TYPE translate = bufp->translate;
2080 /* Address of the count-byte of the most recently inserted `exactn'
2081 command. This makes it possible to tell if a new exact-match
2082 character can be added to that command or if the character requires
2083 a new `exactn' command. */
2084 unsigned char *pending_exact = 0;
2086 /* Address of start of the most recently finished expression.
2087 This tells, e.g., postfix * where to find the start of its
2088 operand. Reset at the beginning of groups and alternatives. */
2089 unsigned char *laststart = 0;
2091 /* Address of beginning of regexp, or inside of last group. */
2092 unsigned char *begalt;
2094 /* Place in the uncompiled pattern (i.e., the {) to
2095 which to go back if the interval is invalid. */
2096 re_char *beg_interval;
2098 /* Address of the place where a forward jump should go to the end of
2099 the containing expression. Each alternative of an `or' -- except the
2100 last -- ends with a forward jump of this sort. */
2101 unsigned char *fixup_alt_jump = 0;
2103 /* Counts open-groups as they are encountered. Remembered for the
2104 matching close-group on the compile stack, so the same register
2105 number is put in the stop_memory as the start_memory. */
2106 regnum_t regnum = 0;
2108 /* we need to close over compile_stack */
2109 # define free_stack(args...) xfree(compile_stack.stack)
2111 #ifdef REGEX_DEBUG_FLAG
2112 DEBUG_PRINT1("\nCompiling pattern: ");
2116 for (debug_count = 0; debug_count < size; debug_count++)
2117 putchar(pattern[debug_count]);
2122 /* Initialize the compile stack. */
2123 compile_stack.stack =
2124 TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2125 if (compile_stack.stack == NULL) {
2128 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2129 compile_stack.avail = 0;
2131 /* Initialize the pattern buffer. */
2132 bufp->syntax = syntax;
2133 bufp->fastmap_accurate = 0;
2134 bufp->not_bol = bufp->not_eol = 0;
2136 /* Set `used' to zero, so that if we return an error, the pattern
2137 printer (for debugging) will think there's no pattern. We reset it
2141 /* Always count groups, whether or not bufp->no_sub is set. */
2143 bufp->re_ngroups = 0;
2145 /* Allocate index translation array if needed. */
2146 if (bufp->external_to_internal_register == 0) {
2147 bufp->external_to_internal_register_size =
2148 INIT_REG_TRANSLATE_SIZE;
2149 RETALLOC(bufp->external_to_internal_register,
2150 bufp->external_to_internal_register_size, int);
2152 /* Initialize translations to impossible value to aid debugging. */
2156 bufp->external_to_internal_register[0] = 0;
2157 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2158 bufp->external_to_internal_register[i] =
2162 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2163 /* Initialize the syntax table. */
2167 if (bufp->allocated == 0) {
2169 /* If zero allocated, but buffer is non-null, try to
2170 realloc enough space. This loses if buffer's address
2171 is bogus, but that is the user's responsibility. */
2172 RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
2174 /* Caller did not allocate a buffer. Do it for them. */
2175 bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
2177 if (!bufp->buffer) {
2182 bufp->allocated = INIT_BUF_SIZE;
2185 begalt = buf_end = bufp->buffer;
2187 /* Loop through the uncompiled pattern until we're at the end. */
2193 if ( /* If at start of pattern, it's an operator. */
2195 /* If context independent, it's an operator. */
2196 || syntax & RE_CONTEXT_INDEP_ANCHORS
2197 /* Otherwise, depends on what's come before. */
2198 || at_begline_loc_p(pattern, p, syntax))
2206 if ( /* If at end of pattern, it's an
2209 /* If context independent, it's an
2211 || syntax & RE_CONTEXT_INDEP_ANCHORS
2212 /* Otherwise, depends on what's
2214 || at_endline_loc_p(p, pend, syntax))
2223 if ((syntax & RE_BK_PLUS_QM) ||
2224 (syntax & RE_LIMITED_OPS)) {
2230 re_bool zero_times_ok;
2231 re_bool many_times_ok;
2234 /* If there is no previous pattern... */
2236 if (syntax & RE_CONTEXT_INVALID_OPS) {
2239 } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) {
2244 /* true means zero/many matches are allowed. */
2245 zero_times_ok = c != '+';
2246 many_times_ok = c != '?';
2248 /* true means match shortest string possible. */
2251 /* If there is a sequence of repetition chars, collapse it
2252 down to just one (the right one). We can't combine
2253 interval operators with these because of, e.g., `a{2}*',
2254 which should only match an even number of `a's. */
2259 || (!(syntax & RE_BK_PLUS_QM)
2260 && (c == '+' || c == '?'))) ;
2262 else if (syntax & RE_BK_PLUS_QM
2270 if (!(c1 == '+' || c1 == '?')) {
2282 /* If we get here, we found another repeat
2284 if (!(syntax & RE_NO_MINIMAL_MATCHING)) {
2285 /* "*?" and "+?" and "??" are okay (and
2286 mean match minimally), but other
2287 sequences (such as "*??" and "+++")
2288 are rejected (reserved for future
2290 if (minimal || c != '?') {
2296 zero_times_ok |= c != '+';
2297 many_times_ok |= c != '?';
2301 /* Star, etc. applied to an empty pattern is equivalent
2302 to an empty pattern. */
2307 /* Now we know whether zero matches is allowed
2308 and whether two or more matches is allowed
2309 and whether we want minimal or maximal matching. */
2311 if (!many_times_ok) {
2313 0: /on_failure_jump to 6
2318 GET_BUFFER_SPACE(6);
2319 INSERT_JUMP(jump, laststart,
2322 INSERT_JUMP(on_failure_jump,
2326 } else if (zero_times_ok) {
2330 6: /on_failure_jump to 3
2333 GET_BUFFER_SPACE(6);
2334 INSERT_JUMP(jump, laststart,
2337 STORE_JUMP(on_failure_jump,
2344 3: /on_failure_jump to 0
2347 GET_BUFFER_SPACE(3);
2348 STORE_JUMP(on_failure_jump,
2349 buf_end, laststart);
2353 /* Are we optimizing this jump? */
2354 re_bool keep_string_p = false;
2356 if (many_times_ok) {
2357 /* More than one repetition is allowed,
2358 so put in at the end a backward
2359 relative jump from `buf_end' to
2360 before the next jump we're going to
2361 put in below (which jumps from
2362 laststart to after this jump).
2364 But if we are at the `*' in the exact
2365 sequence `.*\n', insert an
2366 unconditional jump backwards to the
2367 ., instead of the beginning of the
2368 loop. This way we only push a
2369 failure point once, instead of every
2370 time through the loop. */
2371 assert(p - 1 > pattern);
2373 /* Allocate the space for the jump. */
2374 GET_BUFFER_SPACE(3);
2376 /* We know we are not at the first
2377 character of the pattern, because
2378 laststart was nonzero. And we've
2379 already incremented `p', by the way,
2380 to be the character after the `*'.
2381 Do we have to do something analogous
2382 here for null bytes, because of
2384 if (*(p - 2) == '.' &&
2388 !(syntax & RE_DOT_NEWLINE)) {
2393 keep_string_p = true;
2395 /* Anything else. */
2401 /* We've added more stuff to the
2406 /* On failure, jump from laststart to buf_end +
2407 3, which will be the end of the buffer after
2408 this jump is inserted. */
2409 GET_BUFFER_SPACE(3);
2410 INSERT_JUMP(keep_string_p ?
2411 on_failure_keep_string_jump
2413 laststart, buf_end + 3);
2416 if (!zero_times_ok) {
2417 /* At least one repetition is required,
2418 so insert a `dummy_failure_jump'
2419 before the initial `on_failure_jump'
2420 instruction of the loop. This effects
2421 a skip over that instruction the
2422 first time we hit that loop. */
2423 GET_BUFFER_SPACE(3);
2424 INSERT_JUMP(dummy_failure_jump,
2435 laststart = buf_end;
2440 /* XEmacs change: this whole section */
2441 re_bool had_char_class = false;
2443 re_bool has_extended_chars = false;
2444 REGISTER Lisp_Object rtab = Qnil;
2452 /* Ensure that we have enough space to push a charset:
2453 the opcode, the length count, and the bitset; 34
2455 GET_BUFFER_SPACE(34);
2457 laststart = buf_end;
2459 /* We test `*p == '^' twice, instead of using an if
2460 statement, so we only need one BUF_PUSH. */
2461 BUF_PUSH(*p == '^' ? charset_not : charset);
2465 /* Remember the first position in the bracket
2469 /* Push the number of bytes in the bitmap. */
2470 BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2472 /* Clear the whole map. */
2473 memset(buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2475 /* charset_not matches newline according to a syntax
2477 if ((re_opcode_t) buf_end[-2] == charset_not
2478 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2482 start_over_with_extended:
2483 if (has_extended_chars) {
2484 /* There are extended chars here, which means we
2485 need to start over and shift to unified
2486 range-table format. */
2487 if (buf_end[-2] == charset)
2488 buf_end[-2] = charset_mule;
2490 buf_end[-2] = charset_mule_not;
2493 /* go back to the beginning of the charset,
2494 after a possible ^. */
2495 rtab = Vthe_lisp_rangetab;
2496 Fclear_range_table(rtab);
2498 /* charset_not matches newline according to a
2500 if ((re_opcode_t) buf_end[-1] ==
2503 RE_HAT_LISTS_NOT_NEWLINE))
2504 SET_EITHER_BIT('\n');
2508 /* Read in characters and ranges, setting map bits. */
2517 if (c >= 0x80 && !has_extended_chars) {
2518 has_extended_chars = 1;
2519 /* Frumble-bumble, we've found some
2520 extended chars. Need to start over,
2521 process everything using the general
2522 extended-char mechanism, and need to
2523 use charset_mule and charset_mule_not
2524 instead of charset and
2526 goto start_over_with_extended;
2529 /* \ might escape characters inside [...] and
2531 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) &&
2541 && !has_extended_chars) {
2542 has_extended_chars = 1;
2543 goto start_over_with_extended;
2550 /* Could be the end of the bracket
2551 expression. If it's not (i.e., when
2552 the bracket expression is `[]' so
2553 far), the ']' character bit gets set
2555 if (c == ']' && p != p1 + 1)
2558 /* Look ahead to see if it's a range
2559 when the last thing was a character
2561 if (had_char_class && c == '-'
2567 /* Look ahead to see if it's a range
2568 when the last thing was a character:
2569 if this is a hyphen not at the
2570 beginning or the end of a list, then
2571 it's the range operator. */
2573 && !(p - 2 >= pattern
2575 && !(p - 3 >= pattern
2582 if (*(const unsigned char *)p >= 0x80
2583 && !has_extended_chars) {
2584 has_extended_chars = 1;
2585 goto start_over_with_extended;
2587 if (has_extended_chars)
2588 ret = compile_extended_range
2589 (&p, pend, translate,
2594 (&p, pend, translate,
2596 if (ret != REG_NOERROR) {
2602 else if (p[0] == '-' && p[1] != ']') {
2603 /* This handles ranges made up of
2607 /* Move past the `-'. */
2611 if (*(const unsigned char*)p >= 0x80
2612 && !has_extended_chars) {
2613 has_extended_chars = 1;
2614 goto start_over_with_extended;
2616 if (has_extended_chars)
2617 ret = compile_extended_range(
2618 &p, pend, translate,
2622 ret = compile_range(
2623 &p, pend, translate,
2625 if (ret != REG_NOERROR) {
2631 /* See if we're at the beginning of a possible
2634 else if (syntax & RE_CHAR_CLASSES &&
2635 c == '[' && *p == ':') {
2636 /* Leave room for the null. */
2637 char str[CHAR_CLASS_MAX_LENGTH + 1];
2642 /* If pattern is `[[:'. */
2648 /* #### This code is unused.
2649 Correctness is not checked
2650 after TRT table change. */
2652 if (c == ':' || c == ']'
2655 CHAR_CLASS_MAX_LENGTH)
2657 str[c1++] = (char)c;
2661 /* If isn't a word bracketed by
2662 `[:' and `:]': undo the
2663 ending character, the
2664 letters, and leave the
2665 leading `:' and `[' (but set
2667 if (c == ':' && *p == ']') {
2673 if (wct == RECC_ERROR) {
2678 /* Throw away the ] at
2698 re_iswctype(ch, wct)) {
2702 if (re_iswctype(ch, wct)) {
2707 had_char_class = true;
2713 SET_EITHER_BIT('[');
2714 SET_EITHER_BIT(':');
2715 had_char_class = false;
2718 had_char_class = false;
2724 if (has_extended_chars) {
2725 /* We have a range table, not a bit vector. */
2727 unified_range_table_bytes_needed
2729 GET_BUFFER_SPACE(bytes_needed);
2730 unified_range_table_copy_data(rtab,
2733 unified_range_table_bytes_used
2738 /* Discard any (non)matching list bytes that are
2739 all 0 at the end of the map. Decrease the
2740 map-length byte too. */
2741 while ((int)buf_end[-1] > 0
2742 && buf_end[buf_end[-1] - 1] == 0)
2744 buf_end += buf_end[-1];
2749 if (syntax & RE_NO_BK_PARENS)
2755 if (syntax & RE_NO_BK_PARENS)
2761 if (syntax & RE_NEWLINE_ALT)
2767 if (syntax & RE_NO_BK_VBAR)
2773 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2774 goto handle_interval;
2784 /* Do not translate the character after the \, so that we can
2785 distinguish, e.g., \B from \b, even if we normally would
2786 translate, e.g., B to b. */
2791 if (syntax & RE_NO_BK_PARENS)
2792 goto normal_backslash;
2799 if (!(syntax & RE_NO_SHY_GROUPS)
2800 && p != pend && *p == '?') {
2821 /* Record the translation from capturing group index to
2822 register number, reallocating table as needed. */
2825 external_to_internal_register_size
2830 external_to_internal_register_size;
2832 external_to_internal_register_size
2835 external_to_internal_register,
2837 external_to_internal_register_size,
2843 external_to_internal_register_size;
2846 external_to_internal_register
2853 external_to_internal_register
2858 if (COMPILE_STACK_FULL) {
2859 RETALLOC(compile_stack.stack,
2862 compile_stack_elt_t);
2863 if (compile_stack.stack == NULL)
2866 compile_stack.size <<= 1;
2869 /* These are the values to restore when
2870 we hit end of this group. They are
2871 all relative offsets, so that if the
2872 whole pattern moves because of
2873 realloc, they will still be
2875 COMPILE_STACK_TOP.begalt_offset =
2876 begalt - bufp->buffer;
2877 COMPILE_STACK_TOP.fixup_alt_jump =
2882 COMPILE_STACK_TOP.laststart_offset =
2883 buf_end - bufp->buffer;
2884 COMPILE_STACK_TOP.regnum = r;
2886 /* We will eventually replace the 0 with
2887 the number of groups inner to this
2888 one. But do not push a start_memory
2889 for groups beyond the last one we can
2890 represent in the compiled pattern.
2891 #### bad bad bad. this will fail in
2892 lots of ways, if we ever have to
2893 backtrack for these groups.
2895 if (r <= MAX_REGNUM) {
2897 inner_group_offset =
2898 buf_end - bufp->buffer +
2900 BUF_PUSH_3(start_memory, r, 0);
2903 compile_stack.avail++;
2908 /* If we've reached MAX_REGNUM groups,
2909 then this open won't actually
2910 generate any code, so we'll have to
2911 clear pending_exact explicitly. */
2917 if (syntax & RE_NO_BK_PARENS)
2918 goto normal_backslash;
2920 if (COMPILE_STACK_EMPTY) {
2922 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2923 goto normal_backslash;
2931 if (fixup_alt_jump) {
2932 /* Push a dummy failure point at the end
2933 of the alternative for a possible
2934 future `pop_failure_jump' to pop.
2935 See comments at `push_dummy_failure'
2937 BUF_PUSH(push_dummy_failure);
2939 /* We allocated space for this jump when
2940 we assigned to `fixup_alt_jump', in
2941 the `handle_alt' case below. */
2942 STORE_JUMP(jump_past_alt,
2943 fixup_alt_jump, buf_end - 1);
2946 /* See similar code for backslashed left paren
2948 if (COMPILE_STACK_EMPTY) {
2950 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2958 /* Since we just checked for an empty stack
2959 above, this ``can't happen''. */
2960 assert(compile_stack.avail != 0);
2962 /* We don't just want to restore into
2963 `regnum', because later groups should
2964 continue to be numbered higher, as in
2965 `(ab)c(de)' -- the second group is
2967 regnum_t this_group_regnum;
2969 compile_stack.avail--;
2972 COMPILE_STACK_TOP.begalt_offset;
2985 COMPILE_STACK_TOP.regnum;
2986 /* If we've reached MAX_REGNUM groups,
2987 then this open won't actually
2988 generate any code, so we'll have to
2989 clear pending_exact explicitly. */
2992 /* We're at the end of the group, so now
2993 we know how many groups were inside
2995 if (this_group_regnum <= MAX_REGNUM) {
2996 unsigned char *inner_group_loc
3005 BUF_PUSH_3(stop_memory,
3013 case '|': /* `\|'. */
3014 if (syntax & RE_LIMITED_OPS
3015 || syntax & RE_NO_BK_VBAR) {
3016 goto normal_backslash;
3019 if (syntax & RE_LIMITED_OPS) {
3023 /* Insert before the previous alternative a jump
3024 which jumps to this alternative if the former
3026 GET_BUFFER_SPACE(3);
3027 INSERT_JUMP(on_failure_jump, begalt,
3032 /* The alternative before this one has a jump after it
3033 which gets executed if it gets matched. Adjust that
3034 jump so it will jump to this alternative's analogous
3035 jump (put in below, which in turn will jump to the next
3036 (if any) alternative's such jump, etc.). The last such
3037 jump jumps to the correct final destination. A picture:
3043 If we are at `b', then fixup_alt_jump right now points to a
3044 three-byte space after `a'. We'll put in the jump, set
3045 fixup_alt_jump to right after `b', and leave behind three
3046 bytes which we'll fill in when we get to after `c'. */
3049 STORE_JUMP(jump_past_alt,
3050 fixup_alt_jump, buf_end);
3052 /* Mark and leave space for a jump after this alternative,
3053 to be filled in later either by next alternative or
3054 when know we're at the end of a series of alternatives. */
3055 fixup_alt_jump = buf_end;
3056 GET_BUFFER_SPACE(3);
3064 /* If \{ is a literal. */
3065 if (!(syntax & RE_INTERVALS)
3066 /* If we're at `\{' and it's not the open-interval
3068 || ((syntax & RE_INTERVALS)
3069 && (syntax & RE_NO_BK_BRACES))
3070 || (p - 2 == pattern && p == pend))
3071 goto normal_backslash;
3075 /* If got here, then the syntax allows
3078 /* At least (most) this many matches
3080 int lower_bound = -1, upper_bound = -1;
3082 beg_interval = p - 1;
3085 if (syntax & RE_NO_BK_BRACES) {
3086 goto unfetch_interval;
3093 GET_UNSIGNED_NUMBER(lower_bound);
3096 GET_UNSIGNED_NUMBER(
3098 if (upper_bound < 0) {
3103 /* Interval such as `{1}' =>
3104 match exactly once. */
3105 upper_bound = lower_bound;
3109 || upper_bound > RE_DUP_MAX
3110 || lower_bound > upper_bound) {
3111 if (syntax & RE_NO_BK_BRACES) {
3112 goto unfetch_interval;
3119 if (!(syntax & RE_NO_BK_BRACES)) {
3128 if (syntax & RE_NO_BK_BRACES) {
3129 goto unfetch_interval;
3136 /* We just parsed a valid interval. */
3138 /* If it's invalid to have no preceding
3142 RE_CONTEXT_INVALID_OPS) {
3147 RE_CONTEXT_INDEP_OPS) {
3148 laststart = buf_end;
3150 goto unfetch_interval;
3154 /* If the upper bound is zero, don't
3155 want to succeed at all; jump from
3156 `laststart' to `b + 3', which will be
3157 the end of the buffer after we insert
3159 if (upper_bound == 0) {
3160 GET_BUFFER_SPACE(3);
3161 INSERT_JUMP(jump, laststart,
3166 /* Otherwise, we have a nontrivial interval. When
3167 we're all done, the pattern will look like:
3168 set_number_at <jump count> <upper bound>
3169 set_number_at <succeed_n count> <lower bound>
3170 succeed_n <after jump addr> <succeed_n count>
3172 jump_n <succeed_n addr> <jump count>
3173 (The upper bound and `jump_n' are omitted if
3174 `upper_bound' is 1, though.) */
3175 else { /* If the upper bound is > 1, we need to insert
3176 more at the end of the loop. */
3177 Memory_count nbytes =
3178 10 + (upper_bound > 1) * 10;
3180 GET_BUFFER_SPACE(nbytes);
3182 /* Initialize lower bound of the `succeed_n', even
3183 though it will be set during matching by its
3184 attendant `set_number_at' (inserted next),
3185 because `re_compile_fastmap' needs to know.
3186 Jump to the `jump_n' we might insert below. */
3187 INSERT_JUMP2(succeed_n,
3195 /* Code to initialize the lower bound. Insert
3196 before the `succeed_n'. The `5' is the last two
3197 bytes of this `set_number_at', plus 3 bytes of
3198 the following `succeed_n'. */
3199 insert_op2(set_number_at,
3205 if (upper_bound > 1) { /* More than one repetition is allowed, so
3206 append a backward jump to the `succeed_n'
3207 that starts this interval.
3209 When we've reached this during matching,
3210 we'll have matched the interval once, so
3211 jump back only `upper_bound - 1' times. */
3220 /* The location we want to set is the second
3221 parameter of the `jump_n'; that is `b-2' as
3222 an absolute address. `laststart' will be
3223 the `set_number_at' we're about to insert;
3224 `laststart+3' the number to set, the source
3225 for the relative address. But we are
3226 inserting into the middle of the pattern --
3227 so everything is getting moved up by 5.
3228 Conclusion: (b - 2) - (laststart + 3) + 5,
3229 i.e., b - laststart.
3231 We insert this at the beginning of the loop
3232 so that if we fail during matching, we'll
3233 reinitialize the bounds. */
3245 beg_interval = NULL;
3250 /* If an invalid interval, match the characters as literals. */
3251 assert(beg_interval);
3253 beg_interval = NULL;
3255 /* normal_char and normal_backslash need `c'. */
3258 if (!(syntax & RE_NO_BK_BRACES)) {
3259 if (p > pattern && p[-1] == '\\')
3260 goto normal_backslash;
3265 /* There is no way to specify the before_dot and after_dot
3266 operators. rms says this is ok. --karl */
3272 laststart = buf_end;
3274 /* XEmacs addition */
3275 if (c >= 0x80 || syntax_spec_code[c] == 0377) {
3279 BUF_PUSH_2(syntaxspec, syntax_spec_code[c]);
3283 laststart = buf_end;
3285 /* XEmacs addition */
3286 if (c >= 0x80 || syntax_spec_code[c] == 0377) {
3290 BUF_PUSH_2(notsyntaxspec, syntax_spec_code[c]);
3294 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
3296 laststart = buf_end;
3298 if (c < 32 || c > 127) {
3300 return REG_ECATEGORY;
3302 BUF_PUSH_2(categoryspec, c);
3306 laststart = buf_end;
3308 if (c < 32 || c > 127) {
3310 return REG_ECATEGORY;
3312 BUF_PUSH_2(notcategoryspec, c);
3314 /* end of category patch */
3319 laststart = buf_end;
3324 laststart = buf_end;
3325 BUF_PUSH(notwordchar);
3337 BUF_PUSH(wordbound);
3341 BUF_PUSH(notwordbound);
3363 if (syntax & RE_NO_BK_REFS) {
3366 /* External register indexing. */
3369 if (reg > bufp->re_nsub) {
3374 /* Convert external to internal as soon
3376 reg = bufp->external_to_internal_register[reg];
3378 /* Can't back reference to a
3379 subexpression if inside it. */
3380 if (group_in_compile_stack(
3381 compile_stack, reg)) {
3384 laststart = buf_end;
3385 BUF_PUSH_2(duplicate, reg);
3391 if (syntax & RE_BK_PLUS_QM) {
3394 goto normal_backslash;
3398 /* You might think it would be useful for \ to
3399 mean not to translate; but if we don't
3400 translate it, it will never match
3407 /* Expects the character in `c'. */
3408 /* `p' points to the location after where `c' came
3412 /* XEmacs: modifications here for Mule. */
3413 /* `q' points to the beginning of the next char. */
3416 /* If no exactn currently being built. */
3418 /* If last exactn not at current position. */
3419 || pending_exact + *pending_exact + 1 !=
3421 /* We have only one byte following the exactn for
3423 || ((unsigned int)(*pending_exact + (q - p)) >=
3424 ((unsigned int)(1 << BYTEWIDTH) - 1))
3426 /* If followed by a repetition operator. */
3427 || *q == '*' || *q == '^'
3428 || ((syntax & RE_BK_PLUS_QM)
3429 ? *q == '\\' && (q[1] == '+'
3431 : (*q == '+' || *q == '?'))
3432 || ((syntax & RE_INTERVALS)
3433 && ((syntax & RE_NO_BK_BRACES)
3435 : (q[0] == '\\' && q[1] == '{')))) {
3436 /* Start building a new exactn. */
3438 laststart = buf_end;
3440 BUF_PUSH_2(exactn, 0);
3441 pending_exact = buf_end - 1;
3449 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3453 set_charptr_emchar(tmp_buf, c);
3455 for (i = 0; i < bt_count; i++) {
3456 BUF_PUSH(tmp_buf[i]);
3464 } /* while p != pend */
3466 /* Through the pattern now. */
3468 if (fixup_alt_jump) {
3469 STORE_JUMP(jump_past_alt, fixup_alt_jump, buf_end);
3471 if (!COMPILE_STACK_EMPTY) {
3475 /* If we don't want backtracking, force success
3476 the first time we reach the end of the compiled pattern. */
3477 if (syntax & RE_NO_POSIX_BACKTRACKING) {
3480 xfree(compile_stack.stack);
3482 /* We have succeeded; set the length of the buffer. */
3483 bufp->used = buf_end - bufp->buffer;
3485 #ifdef REGEX_DEBUG_FLAG
3487 DEBUG_PRINT1("\nCompiled pattern: \n");
3488 print_compiled_pattern(bufp);
3492 #ifndef MATCH_MAY_ALLOCATE
3493 /* Initialize the failure stack to the largest possible stack. This
3494 isn't necessary unless we're trying to avoid calling alloca in
3495 the search and match routines. */
3497 int num_regs = bufp->re_ngroups + 1;
3499 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3500 is strictly greater than re_max_failures, the largest possible stack
3501 is 2 * re_max_failures failure points. */
3502 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) {
3504 (2 * re_max_failures * MAX_FAILURE_ITEMS);
3506 if (!fail_stack.stack) {
3508 (fail_stack_elt_t *)
3511 sizeof(fail_stack_elt_t));
3514 (fail_stack_elt_t *)
3515 xrealloc(fail_stack.stack,
3517 sizeof(fail_stack_elt_t)));
3521 regex_grow_registers(num_regs);
3523 #endif /* not MATCH_MAY_ALLOCATE */
3529 /* Subroutines for `regex_compile'. */
3531 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3533 static void store_op1(re_opcode_t op, unsigned char *loc, int arg)
3535 *loc = (unsigned char)op;
3536 STORE_NUMBER(loc + 1, arg);
3539 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3541 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3543 *loc = (unsigned char)op;
3544 STORE_NUMBER(loc + 1, arg1);
3545 STORE_NUMBER(loc + 3, arg2);
3548 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3549 for OP followed by two-byte integer parameter ARG. */
3552 insert_op1(re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3554 REGISTER unsigned char *pfrom = end;
3555 REGISTER unsigned char *pto = end + 3;
3557 while (pfrom != loc)
3560 store_op1(op, loc, arg);
3563 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3566 insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3569 REGISTER unsigned char *pfrom = end;
3570 REGISTER unsigned char *pto = end + 5;
3572 while (pfrom != loc)
3575 store_op2(op, loc, arg1, arg2);
3578 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3579 after an alternative or a begin-subexpression. We assume there is at
3580 least one character before the ^. */
3583 at_begline_loc_p(re_char *pattern, re_char *p, reg_syntax_t syntax)
3585 re_char *prev = p - 2;
3586 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3589 /* After a subexpression? */
3590 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3591 /* After an alternative? */
3593 && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3596 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3597 at least one character after the $, i.e., `P < PEND'. */
3600 at_endline_loc_p(re_char *p, re_char *pend, int syntax)
3603 re_bool next_backslash = *next == '\\';
3604 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3607 /* Before a subexpression? */
3608 (syntax & RE_NO_BK_PARENS ? *next == ')'
3609 : next_backslash && next_next && *next_next == ')')
3610 /* Before an alternative? */
3611 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3612 : next_backslash && next_next && *next_next == '|');
3615 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3616 false if it's not. */
3619 group_in_compile_stack(compile_stack_type compile_stack, regnum_t regnum)
3623 for (this_element = compile_stack.avail - 1;
3624 this_element >= 0; this_element--)
3625 if (compile_stack.stack[this_element].regnum == regnum) {
3631 /* Read the ending character of a range (in a bracket expression) from the
3632 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3633 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3634 Then we set the translation of all bits between the starting and
3635 ending characters (inclusive) in the compiled pattern B.
3637 Return an error code.
3639 We use these short variable names so we can use the same macros as
3640 `regex_compile' itself. */
3642 static reg_errcode_t
3643 compile_range(re_char ** p_ptr, re_char * pend, RE_TRANSLATE_TYPE translate,
3644 reg_syntax_t syntax, unsigned char *buf_end)
3646 Element_count this_char;
3648 re_char *p = *p_ptr;
3649 int range_start, range_end;
3654 /* Even though the pattern is a signed `char *', we need to fetch
3655 with unsigned char *'s; if the high bit of the pattern character
3656 is set, the range endpoints will be negative if we fetch using a
3659 We also want to fetch the endpoints without translating them; the
3660 appropriate translation is done in the bit-setting loop below. */
3661 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3662 range_start = ((const unsigned char *)p)[-2];
3663 range_end = ((const unsigned char *)p)[0];
3665 /* Have to increment the pointer into the pattern string, so the
3666 caller isn't still at the ending character. */
3669 /* If the start is after the end, the range is empty. */
3670 if (range_start > range_end)
3671 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3673 /* Here we see why `this_char' has to be larger than an `unsigned
3674 char' -- the range is inclusive, so if `range_end' == 0xff
3675 (assuming 8-bit characters), we would otherwise go into an infinite
3676 loop, since all characters <= 0xff. */
3677 for (this_char = range_start; this_char <= range_end; this_char++) {
3678 SET_LIST_BIT(TRANSLATE(this_char));
3686 static reg_errcode_t
3687 compile_extended_range(re_char ** p_ptr, re_char * pend,
3688 RE_TRANSLATE_TYPE translate,
3689 reg_syntax_t syntax, Lisp_Object rtab)
3691 Emchar this_char, range_start, range_end;
3697 p = (const Bufbyte *)*p_ptr;
3698 range_end = charptr_emchar(p);
3699 p--; /* back to '-' */
3700 DEC_CHARPTR(p); /* back to start of range */
3701 /* We also want to fetch the endpoints without translating them; the
3702 appropriate translation is done in the bit-setting loop below. */
3703 range_start = charptr_emchar(p);
3704 INC_CHARPTR(*p_ptr);
3706 /* If the start is after the end, the range is empty. */
3707 if (range_start > range_end)
3708 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3710 /* Can't have ranges spanning different charsets, except maybe for
3711 ranges entirely within the first 256 chars. */
3713 if ((range_start >= 0x100 || range_end >= 0x100)
3714 && CHAR_LEADING_BYTE(range_start) != CHAR_LEADING_BYTE(range_end))
3715 return REG_ERANGESPAN;
3717 /* As advertised, translations only work over the 0 - 0x7F range.
3718 Making this kind of stuff work generally is much harder.
3719 Iterating over the whole range like this would be way efficient
3720 if the range encompasses 10,000 chars or something. You'd have
3721 to do something like this:
3725 map over translation table in [range_start, range_end] of
3726 (put the mapped range in a;
3727 put the translation in b)
3728 invert the range in a and truncate to [range_start, range_end]
3729 compute the union of a, b
3730 union the result into rtab
3732 for (this_char = range_start;
3733 this_char <= range_end && this_char < 0x80; this_char++) {
3734 SET_RANGETAB_BIT(TRANSLATE(this_char));
3737 if (this_char <= range_end)
3738 put_range_table(rtab, this_char, range_end, Qt);
3745 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3746 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3747 characters can start a string that matches the pattern. This fastmap
3748 is used by re_search to skip quickly over impossible starting points.
3750 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3751 area as BUFP->fastmap.
3753 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3756 Returns 0 if we succeed, -2 if an internal error. */
3758 int re_compile_fastmap(struct re_pattern_buffer *bufp)
3761 #ifdef MATCH_MAY_ALLOCATE
3762 fail_stack_type fail_stack;
3764 DECLARE_DESTINATION;
3765 /* We don't push any register information onto the failure stack. */
3767 REGISTER char *fastmap = bufp->fastmap;
3768 unsigned char *pattern = bufp->buffer;
3769 unsigned long size = bufp->used;
3770 /* actually p supposed to carry the const qualifier, however, some
3771 silly mule code below CHANGES p and hence we cant go with the const
3772 qualifier here, leaving an `unfixable' warning on the way */
3774 unsigned char *p = pattern;
3776 re_char *p = pattern;
3778 REGISTER unsigned char *pend = pattern + size;
3781 /* This holds the pointer to the failure stack, when
3782 it is allocated relocatably. */
3783 fail_stack_elt_t *failure_stack_ptr;
3786 /* Assume that each path through the pattern can be null until
3787 proven otherwise. We set this false at the bottom of switch
3788 statement, to which we get only if a particular path doesn't
3789 match the empty string. */
3790 re_bool path_can_be_null = true;
3792 /* We aren't doing a `succeed_n' to begin with. */
3793 re_bool succeed_n_p = false;
3795 assert(fastmap != NULL && p != NULL);
3798 memset(fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3799 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3800 bufp->can_be_null = 0;
3803 if (p == pend || *p == succeed) {
3804 /* We have reached the (effective) end of pattern. */
3805 if (!FAIL_STACK_EMPTY()) {
3806 bufp->can_be_null |= path_can_be_null;
3808 /* Reset for next path. */
3809 path_can_be_null = true;
3811 /* fuck, p isnt const thanks to that
3812 * unified range table function below */
3814 p = (unsigned char*)fail_stack.
3815 stack[--fail_stack.avail].pointer;
3817 p = fail_stack.stack[--fail_stack.avail]
3827 /* We should never be about to go beyond the end of the pattern. */
3830 switch (SWITCH_ENUM_CAST((re_opcode_t) * p++)) {
3832 /* I guess the idea here is to simply not bother with a
3833 fastmap if a backreference is used, since it's too
3834 hard to figure out the fastmap for the corresponding
3835 group. Setting `can_be_null' stops `re_search_2'
3836 from using the fastmap, so that is all we do. */
3838 bufp->can_be_null = 1;
3841 /* Following are the cases which match a character.
3842 These end with `break'. */
3849 /* XEmacs: Under Mule, these bit vectors will
3850 only contain values for characters below 0x80. */
3851 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3852 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3857 /* Chars beyond end of map must be allowed. */
3859 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3861 /* And all extended characters must be allowed, too. */
3862 for (j = 0x80; j < 0xA0; j++)
3864 #else /* not MULE */
3865 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3869 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3871 (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3876 case charset_mule: {
3880 nentries = unified_range_table_nentries(p);
3881 for (i = 0; i < nentries; i++) {
3882 EMACS_INT first, last;
3883 Lisp_Object dummy_val;
3885 Bufbyte strr[MAX_EMCHAR_LEN];
3887 unified_range_table_get_range(p, i,
3892 jj <= last && jj < 0x80; jj++)
3894 /* Ranges below 0x100 can span charsets, but there
3895 are only two (Control-1 and Latin-1), and
3896 either first or last has to be in them. */
3897 set_charptr_emchar(strr, first);
3900 set_charptr_emchar(strr, last);
3907 case charset_mule_not: {
3911 nentries = unified_range_table_nentries(p);
3912 for (i = 0; i < nentries; i++) {
3913 EMACS_INT first, last;
3914 Lisp_Object dummy_val;
3916 int smallest_prev = 0;
3918 unified_range_table_get_range(p, i,
3922 for (jj = smallest_prev;
3923 jj < first && jj < 0x80; jj++)
3925 smallest_prev = last + 1;
3926 if (smallest_prev >= 0x80)
3929 /* Calculating which leading bytes are actually allowed
3930 here is rather difficult, so we just punt and allow
3932 for (i = 0x80; i < 0xA0; i++)
3943 for (j = 0; j < (1 << BYTEWIDTH); j++)
3946 (regex_emacs_buffer->mirror_syntax_table),
3955 goto matchnotsyntax;
3957 for (j = 0; j < (1 << BYTEWIDTH); j++)
3960 (regex_emacs_buffer->mirror_syntax_table),
3967 int fastmap_newline = fastmap['\n'];
3969 /* `.' matches anything ... */
3971 /* "anything" only includes bytes that can be the
3972 first byte of a character. */
3973 for (j = 0; j < 0xA0; j++)
3976 for (j = 0; j < (1 << BYTEWIDTH); j++)
3980 /* ... except perhaps newline. */
3981 if (!(bufp->syntax & RE_DOT_NEWLINE))
3982 fastmap['\n'] = fastmap_newline;
3984 /* Return if we have already set `can_be_null'; if we
3985 have, then the fastmap is irrelevant. Something's
3987 else if (bufp->can_be_null)
3990 /* Otherwise, have to check alternative paths. */
4001 /* This match depends on text properties. These end with
4002 aborting optimizations. */
4003 bufp->can_be_null = 1;
4009 for (j = 0; j < 0x80; j++)
4012 (regex_emacs_buffer->mirror_syntax_table),
4013 j) == (enum syntaxcode)k)
4015 for (j = 0x80; j < 0xA0; j++) {
4016 if (LEADING_BYTE_PREFIX_P(j))
4017 /* too complicated to calculate this right */
4023 cset = CHARSET_BY_LEADING_BYTE(j);
4024 if (CHARSETP(cset)) {
4026 (regex_emacs_buffer, cset,
4028 == Sword || multi_p)
4033 #else /* not MULE */
4034 for (j = 0; j < (1 << BYTEWIDTH); j++)
4037 (regex_emacs_buffer->mirror_syntax_table),
4038 j) == (enum syntaxcode)k)
4045 for (j = 0; j < 0x80; j++)
4048 (regex_emacs_buffer->mirror_syntax_table),
4049 j) != (enum syntaxcode)k)
4051 for (j = 0x80; j < 0xA0; j++) {
4052 if (LEADING_BYTE_PREFIX_P(j))
4053 /* too complicated to calculate this right */
4059 cset = CHARSET_BY_LEADING_BYTE(j);
4060 if (CHARSETP(cset)) {
4062 (regex_emacs_buffer, cset,
4064 != Sword || multi_p)
4069 #else /* not MULE */
4070 for (j = 0; j < (1 << BYTEWIDTH); j++)
4073 (regex_emacs_buffer->mirror_syntax_table),
4074 j) != (enum syntaxcode)k)
4081 /* 97/2/17 jhod category patch */
4083 case notcategoryspec:
4084 bufp->can_be_null = 1;
4086 /* end if category patch */
4089 /* All cases after this match the empty string. These
4090 end with `continue'. */
4109 case push_dummy_failure:
4113 case pop_failure_jump:
4114 case maybe_pop_jump:
4117 case dummy_failure_jump:
4118 EXTRACT_NUMBER_AND_INCR(j, p);
4123 /* Jump backward implies we just went through the body
4124 of a loop and matched nothing. Opcode jumped to
4125 should be `on_failure_jump' or `succeed_n'. Just
4126 treat it like an ordinary jump. For a * loop, it has
4127 pushed its failure point already; if so, discard that
4129 if ((re_opcode_t) * p != on_failure_jump
4130 && (re_opcode_t) * p != succeed_n)
4134 EXTRACT_NUMBER_AND_INCR(j, p);
4137 /* If what's on the stack is where we are now, pop
4139 if (!FAIL_STACK_EMPTY()
4140 && fail_stack.stack[fail_stack.avail - 1].pointer ==
4146 case on_failure_jump:
4147 case on_failure_keep_string_jump:
4148 handle_on_failure_jump:
4149 EXTRACT_NUMBER_AND_INCR(j, p);
4151 /* For some patterns, e.g., `(a?)?', `p+j' here points
4152 to the end of the pattern. We don't want to push
4153 such a point, since when we restore it above,
4154 entering the switch will increment `p' past the end
4155 of the pattern. We don't need to push such a point
4156 since we obviously won't find any more fastmap
4157 entries beyond `pend'. Such a pattern can match the
4158 null string, though. */
4160 if (!PUSH_PATTERN_OP(p + j, fail_stack)) {
4165 bufp->can_be_null = 1;
4168 EXTRACT_NUMBER_AND_INCR(k, p); /* Skip the n. */
4169 succeed_n_p = false;
4175 /* Get to the number of times to succeed. */
4178 /* Increment p past the n for when k != 0. */
4179 EXTRACT_NUMBER_AND_INCR(k, p);
4182 succeed_n_p = true; /* Spaghetti code alert. */
4183 goto handle_on_failure_jump;
4198 abort(); /* We have listed all the cases. */
4201 /* Getting here means we have found the possible starting
4202 characters for one path of the pattern -- and that the empty
4203 string does not match. We need not follow this path further.
4204 Instead, look at the next alternative (remembered on the
4205 stack), or quit if no more. The test at the top of the loop
4206 does these things. */
4207 path_can_be_null = false;
4211 /* Set `can_be_null' for the last path (also the first path, if the
4212 pattern is empty). */
4213 bufp->can_be_null |= path_can_be_null;
4218 } /* re_compile_fastmap */
4220 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4221 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4222 this memory for recording register information. STARTS and ENDS
4223 must be allocated using the malloc library routine, and must each
4224 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4226 If NUM_REGS == 0, then subsequent matches should allocate their own
4229 Unless this function is called, the first search or match using
4230 PATTERN_BUFFER will allocate its own register data, without
4231 freeing the old data. */
4234 re_set_registers(struct re_pattern_buffer *bufp, struct re_registers *regs,
4235 unsigned num_regs, regoff_t * starts, regoff_t * ends)
4238 bufp->regs_allocated = REGS_REALLOCATE;
4239 regs->num_regs = num_regs;
4240 regs->start = starts;
4243 bufp->regs_allocated = REGS_UNALLOCATED;
4245 regs->start = regs->end = (regoff_t *) 0;
4249 /* Searching routines. */
4251 /* Like re_search_2, below, but only one string is specified, and
4252 doesn't let you say where to stop matching. */
4255 re_search(struct re_pattern_buffer *bufp, const char *string, int size,
4256 int startpos, int range, struct re_registers *regs)
4258 return re_search_2(bufp, NULL, 0, string, size, startpos, range,
4263 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
4264 # define bytecount_to_charcount(ptr, len) (len)
4265 # define charcount_to_bytecount(ptr, len) (len)
4266 typedef int Charcount;
4269 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4270 virtual concatenation of STRING1 and STRING2, starting first at index
4271 STARTPOS, then at STARTPOS + 1, and so on.
4273 With MULE, STARTPOS is a byte position, not a char position. And the
4274 search will increment STARTPOS by the width of the current leading
4277 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4279 RANGE is how far to scan while trying to match. RANGE = 0 means try
4280 only at STARTPOS; in general, the last start tried is STARTPOS +
4283 With MULE, RANGE is a byte position, not a char position. The last
4284 start tried is the character starting <= STARTPOS + RANGE.
4286 In REGS, return the indices of the virtual concatenation of STRING1
4287 and STRING2 that matched the entire BUFP->buffer and its contained
4290 Do not consider matching one past the index STOP in the virtual
4291 concatenation of STRING1 and STRING2.
4293 We return either the position in the strings at which the match was
4294 found, -1 if no match, or -2 if error (such as failure
4298 re_search_2(struct re_pattern_buffer *bufp, const char *str1,
4299 int size1, const char *str2, int size2, int startpos,
4300 int range, struct re_registers *regs, int stop)
4303 re_char *string1 = (re_char *) str1;
4304 re_char *string2 = (re_char *) str2;
4305 REGISTER char *fastmap = bufp->fastmap;
4306 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
4307 int total_size = size1 + size2;
4308 int endpos = startpos + range;
4309 #ifdef REGEX_BEGLINE_CHECK
4310 int anchored_at_begline = 0;
4315 /* Check for out-of-range STARTPOS. */
4316 if (startpos < 0 || startpos > total_size)
4319 /* Fix up RANGE if it might eventually take us outside
4320 the virtual concatenation of STRING1 and STRING2. */
4322 range = 0 - startpos;
4323 else if (endpos > total_size)
4324 range = total_size - startpos;
4326 /* If the search isn't to be a backwards one, don't waste time in a
4327 search for a pattern that must be anchored. */
4328 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf
4333 d = ((const unsigned char *)
4335 size1 ? string2 - size1 : string1) + startpos);
4336 range = charcount_to_bytecount(d, 1);
4340 /* In a forward search for something that starts with \=.
4341 don't keep searching past point. */
4342 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot
4345 BUF_PT(regex_emacs_buffer) - BUF_BEGV(regex_emacs_buffer)
4352 /* Update the fastmap now if not correct already. */
4353 if (fastmap && !bufp->fastmap_accurate)
4354 if (re_compile_fastmap(bufp) == -2)
4357 #ifdef REGEX_BEGLINE_CHECK
4359 unsigned long i = 0;
4361 while (i < bufp->used) {
4362 if (bufp->buffer[i] == start_memory ||
4363 bufp->buffer[i] == stop_memory)
4368 anchored_at_begline = i < bufp->used
4369 && bufp->buffer[i] == begline;
4374 SETUP_SYNTAX_CACHE_FOR_OBJECT(regex_match_object,
4376 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR
4377 (regex_match_object, regex_emacs_buffer,
4381 /* Loop through the string, looking for a place to start matching. */
4383 #ifdef REGEX_BEGLINE_CHECK
4384 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4385 then we can speed things up by skipping to the next beginning-of-
4387 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4389 /* whose stupid idea was it anyway to make this
4390 function take two strings to match?? */
4394 if (startpos < size1 && startpos + range >= size1)
4395 lim = range - (size1 - startpos);
4397 d = ((const unsigned char *)
4399 size1 ? string2 - size1 : string1) + startpos);
4400 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4401 d_size = charcount_to_bytecount(d, 1);
4403 if (TRANSLATE_P(translate))
4404 while (range > lim && *d != '\n') {
4405 d += d_size; /* Speedier INC_CHARPTR(d) */
4406 d_size = charcount_to_bytecount(d, 1);
4409 while (range > lim && *d != '\n') {
4410 d += d_size; /* Speedier INC_CHARPTR(d) */
4411 d_size = charcount_to_bytecount(d, 1);
4415 startpos += irange - range;
4417 #endif /* REGEX_BEGLINE_CHECK */
4419 /* If a fastmap is supplied, skip quickly over characters that
4420 cannot be the start of a match. If the pattern can match the
4421 null string, however, we don't need to skip characters; we want
4422 the first null string. */
4423 if (fastmap && startpos < total_size && !bufp->can_be_null) {
4424 if (range > 0) { /* Searching forwards. */
4428 if (startpos < size1
4429 && startpos + range >= size1)
4430 lim = range - (size1 - startpos);
4432 d = ((const unsigned char *)
4434 size1 ? string2 - size1 : string1) +
4437 /* Written out as an if-else to avoid testing `translate'
4439 if (TRANSLATE_P(translate))
4440 while (range > lim) {
4443 Bufbyte str[MAX_EMCHAR_LEN];
4445 buf_ch = charptr_emchar(d);
4446 buf_ch = RE_TRANSLATE(buf_ch);
4447 set_charptr_emchar(str, buf_ch);
4449 || fastmap[(unsigned char)
4459 charcount_to_bytecount(d,
4462 d += d_size; /* Speedier INC_CHARPTR(d) */
4464 while (range > lim && !fastmap[*d]) {
4466 charcount_to_bytecount(d,
4469 d += d_size; /* Speedier INC_CHARPTR(d) */
4472 startpos += irange - range;
4473 } else { /* Searching backwards. */
4475 Emchar c = (size1 == 0 || startpos >= size1
4476 ? charptr_emchar(string2 +
4478 : charptr_emchar(string1 +
4482 if (!(c >= 0200 || fastmap[(unsigned char)c]))
4485 if (!fastmap[(unsigned char)c])
4491 /* If can't match the null string, and that's all we have left, fail. */
4492 if (range >= 0 && startpos == total_size && fastmap
4493 && !bufp->can_be_null)
4496 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4497 if (!no_quit_in_re_search)
4500 val = re_match_2_internal(bufp, string1, size1, string2, size2,
4501 startpos, regs, stop);
4502 #ifndef REGEX_MALLOC
4517 else if (range > 0) {
4518 d = ((const unsigned char *)
4520 size1 ? string2 - size1 : string1) + startpos);
4521 d_size = charcount_to_bytecount(d, 1);
4525 /* Note startpos > size1 not >=. If we are on the
4526 string1/string2 boundary, we want to backup into string1. */
4527 d = ((const unsigned char *)
4529 size1 ? string2 - size1 : string1) + startpos);
4531 d_size = charcount_to_bytecount(d, 1);
4539 /* Declarations and macros for re_match_2. */
4541 /* This converts PTR, a pointer into one of the search strings `string1'
4542 and `string2' into an offset from the beginning of that string. */
4543 #define POINTER_TO_OFFSET(ptr) \
4544 (FIRST_STRING_P (ptr) \
4545 ? ((regoff_t) ((ptr) - string1)) \
4546 : ((regoff_t) ((ptr) - string2 + size1)))
4548 /* Macros for dealing with the split strings in re_match_2. */
4550 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4552 /* Call before fetching a character with *d. This switches over to
4553 string2 if necessary. */
4554 #define REGEX_PREFETCH() \
4557 /* End of string2 => fail. */ \
4558 if (dend == end_match_2) \
4560 /* End of string1 => advance to string2. */ \
4562 dend = end_match_2; \
4565 /* Test if at very beginning or at very end of the virtual concatenation
4566 of `string1' and `string2'. If only one string, it's `string2'. */
4567 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4568 #define AT_STRINGS_END(d) ((d) == end2)
4571 If the given position straddles the string gap, return the equivalent
4572 position that is before or after the gap, respectively; otherwise,
4573 return the same position. */
4574 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4575 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4577 /* Test if CH is a word-constituent character. (XEmacs change) */
4578 #define WORDCHAR_P_UNSAFE(ch) \
4579 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4582 /* Free everything we malloc. */
4583 #ifdef MATCH_MAY_ALLOCATE
4584 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4585 #define FREE_VARIABLES() \
4587 REGEX_FREE_STACK (fail_stack.stack); \
4588 FREE_VAR (regstart); \
4589 FREE_VAR (regend); \
4590 FREE_VAR (old_regstart); \
4591 FREE_VAR (old_regend); \
4592 FREE_VAR (best_regstart); \
4593 FREE_VAR (best_regend); \
4594 FREE_VAR (reg_info); \
4595 FREE_VAR (reg_dummy); \
4596 FREE_VAR (reg_info_dummy); \
4598 #else /* not MATCH_MAY_ALLOCATE */
4599 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4600 #endif /* MATCH_MAY_ALLOCATE */
4602 /* These values must meet several constraints. They must not be valid
4603 register values; since we have a limit of 255 registers (because
4604 we use only one byte in the pattern for the register number), we can
4605 use numbers larger than 255. They must differ by 1, because of
4606 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4607 be larger than the value for the highest register, so we do not try
4608 to actually save any registers when none are active. */
4609 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4610 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4612 /* Matching routines. */
4614 #ifndef emacs /* Emacs never uses this. */
4615 /* re_match is like re_match_2 except it takes only a single string. */
4618 re_match(struct re_pattern_buffer *bufp, const char *string, int size,
4619 int pos, struct re_registers *regs)
4622 re_match_2_internal(bufp, NULL, 0, (re_char *) string, size,
4627 #endif /* not emacs */
4629 /* re_match_2 matches the compiled pattern in BUFP against the
4630 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4631 SIZE2, respectively). We start matching at POS, and stop matching
4634 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4635 store offsets for the substring each group matched in REGS. See the
4636 documentation for exactly how many groups we fill.
4638 We return -1 if no match, -2 if an internal error (such as the
4639 failure stack overflowing). Otherwise, we return the length of the
4640 matched substring. */
4643 re_match_2(struct re_pattern_buffer *bufp, const char *string1,
4644 int size1, const char *string2, int size2, int pos,
4645 struct re_registers *regs, int stop)
4650 SETUP_SYNTAX_CACHE_FOR_OBJECT(regex_match_object,
4652 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR
4653 (regex_match_object, regex_emacs_buffer,
4657 result = re_match_2_internal(bufp, (re_char *) string1, size1,
4658 (re_char *) string2, size2,
4665 /* This is a separate function so that we can force an alloca cleanup
4668 re_match_2_internal(struct re_pattern_buffer *bufp, re_char * string1,
4669 int size1, re_char * string2, int size2, int pos,
4670 struct re_registers *regs, int stop)
4672 /* General temporaries. */
4675 int should_succeed; /* XEmacs change */
4677 /* Just past the end of the corresponding string. */
4678 re_char *end1, *end2;
4680 /* Pointers into string1 and string2, just past the last characters in
4681 each to consider matching. */
4682 re_char *end_match_1, *end_match_2;
4684 /* Where we are in the data, and the end of the current string. */
4685 const re_char *d, *dend;
4687 /* Where we are in the pattern, and the end of the pattern. */
4688 unsigned char *p = bufp->buffer;
4689 REGISTER unsigned char *pend = p + bufp->used;
4691 /* Mark the opcode just after a start_memory, so we can test for an
4692 empty subpattern when we get to the stop_memory. */
4693 re_char *just_past_start_mem = 0;
4695 /* We use this to map every character in the string. */
4696 RE_TRANSLATE_TYPE translate = bufp->translate;
4698 /* Failure point stack. Each place that can handle a failure further
4699 down the line pushes a failure point on this stack. It consists of
4700 restart, regend, and reg_info for all registers corresponding to
4701 the subexpressions we're currently inside, plus the number of such
4702 registers, and, finally, two char *'s. The first char * is where
4703 to resume scanning the pattern; the second one is where to resume
4704 scanning the strings. If the latter is zero, the failure point is
4705 a ``dummy''; if a failure happens and the failure point is a dummy,
4706 it gets discarded and the next one is tried. */
4707 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4708 fail_stack_type fail_stack;
4710 #ifdef REGEX_DEBUG_FLAG
4711 static unsigned failure_id;
4712 int nfailure_points_pushed = 0, nfailure_points_popped = 0;
4716 /* This holds the pointer to the failure stack, when
4717 it is allocated relocatably. */
4718 fail_stack_elt_t *failure_stack_ptr;
4721 /* We fill all the registers internally, independent of what we
4722 return, for use in backreferences. The number here includes
4723 an element for register zero. */
4724 int num_regs = bufp->re_ngroups + 1;
4726 /* The currently active registers. */
4727 int lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4728 int highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4730 /* Information on the contents of registers. These are pointers into
4731 the input strings; they record just what was matched (on this
4732 attempt) by a subexpression part of the pattern, that is, the
4733 regnum-th regstart pointer points to where in the pattern we began
4734 matching and the regnum-th regend points to right after where we
4735 stopped matching the regnum-th subexpression. (The zeroth register
4736 keeps track of what the whole pattern matches.) */
4737 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4738 re_char **regstart, **regend;
4741 /* If a group that's operated upon by a repetition operator fails to
4742 match anything, then the register for its start will need to be
4743 restored because it will have been set to wherever in the string we
4744 are when we last see its open-group operator. Similarly for a
4746 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4747 re_char **old_regstart, **old_regend;
4750 /* The is_active field of reg_info helps us keep track of which (possibly
4751 nested) subexpressions we are currently in. The matched_something
4752 field of reg_info[reg_num] helps us tell whether or not we have
4753 matched any of the pattern so far this time through the reg_num-th
4754 subexpression. These two fields get reset each time through any
4755 loop their register is in. */
4756 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4757 register_info_type *reg_info;
4760 /* The following record the register info as found in the above
4761 variables when we find a match better than any we've seen before.
4762 This happens as we backtrack through the failure points, which in
4763 turn happens only if we have not yet matched the entire string. */
4764 unsigned best_regs_set = false;
4765 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4766 re_char **best_regstart, **best_regend;
4769 /* Logically, this is `best_regend[0]'. But we don't want to have to
4770 allocate space for that if we're not allocating space for anything
4771 else (see below). Also, we never need info about register 0 for
4772 any of the other register vectors, and it seems rather a kludge to
4773 treat `best_regend' differently than the rest. So we keep track of
4774 the end of the best match so far in a separate variable. We
4775 initialize this to NULL so that when we backtrack the first time
4776 and need to test it, it's not garbage. */
4777 re_char *match_end = NULL;
4779 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4780 int set_regs_matched_done = 0;
4782 /* Used when we pop values we don't care about. */
4783 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4784 re_char **reg_dummy;
4785 register_info_type *reg_info_dummy;
4788 #ifdef REGEX_DEBUG_FLAG
4789 /* Counts the total number of registers pushed. */
4790 unsigned num_regs_pushed = 0;
4793 /* 1 if this match ends in the same string (string1 or string2)
4794 as the best previous match. */
4797 /* 1 if this match is the best seen so far. */
4798 re_bool best_match_p;
4800 DEBUG_PRINT1("\n\nEntering re_match_2.\n");
4804 #ifdef MATCH_MAY_ALLOCATE
4805 /* Do not bother to initialize all the register variables if there are
4806 no groups in the pattern, as it takes a fair amount of time. If
4807 there are groups, we include space for register 0 (the whole
4808 pattern), even though we never use it, since it simplifies the
4809 array indexing. We should fix this. */
4810 if (bufp->re_ngroups) {
4811 regstart = REGEX_TALLOC(num_regs, re_char *);
4812 regend = REGEX_TALLOC(num_regs, re_char *);
4813 old_regstart = REGEX_TALLOC(num_regs, re_char *);
4814 old_regend = REGEX_TALLOC(num_regs, re_char *);
4815 best_regstart = REGEX_TALLOC(num_regs, re_char *);
4816 best_regend = REGEX_TALLOC(num_regs, re_char *);
4817 reg_info = REGEX_TALLOC(num_regs, register_info_type);
4818 reg_dummy = REGEX_TALLOC(num_regs, re_char *);
4819 reg_info_dummy = REGEX_TALLOC(num_regs, register_info_type);
4822 (regstart && regend && old_regstart && old_regend
4823 && reg_info && best_regstart && best_regend && reg_dummy
4824 && reg_info_dummy)) {
4829 /* We must initialize all our variables to NULL, so that
4830 `FREE_VARIABLES' doesn't try to free them. */
4831 regstart = regend = old_regstart = old_regend = best_regstart
4832 = best_regend = reg_dummy = NULL;
4833 reg_info = reg_info_dummy = (register_info_type *) NULL;
4835 #endif /* MATCH_MAY_ALLOCATE */
4837 /* The starting position is bogus. */
4838 if (pos < 0 || pos > size1 + size2) {
4843 /* Initialize subexpression text positions to -1 to mark ones that no
4844 start_memory/stop_memory has been seen for. Also initialize the
4845 register information struct. */
4846 for (mcnt = 1; mcnt < num_regs; mcnt++) {
4847 regstart[mcnt] = regend[mcnt]
4848 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4850 REG_MATCH_NULL_STRING_P(reg_info[mcnt]) =
4851 MATCH_NULL_UNSET_VALUE;
4852 IS_ACTIVE(reg_info[mcnt]) = 0;
4853 MATCHED_SOMETHING(reg_info[mcnt]) = 0;
4854 EVER_MATCHED_SOMETHING(reg_info[mcnt]) = 0;
4856 /* We move `string1' into `string2' if the latter's empty -- but not if
4857 `string1' is null. */
4858 if (size2 == 0 && string1 != NULL) {
4864 end1 = string1 + size1;
4865 end2 = string2 + size2;
4867 /* Compute where to stop matching, within the two strings. */
4868 if (stop <= size1) {
4869 end_match_1 = string1 + stop;
4870 end_match_2 = string2;
4873 end_match_2 = string2 + stop - size1;
4876 /* `p' scans through the pattern as `d' scans through the data.
4877 `dend' is the end of the input string that `d' points within. `d'
4878 is advanced into the following input string whenever necessary, but
4879 this happens before fetching; therefore, at the beginning of the
4880 loop, `d' can be pointing at the end of a string, but it cannot
4882 if (size1 > 0 && pos <= size1) {
4886 d = string2 + pos - size1;
4890 DEBUG_PRINT1("The compiled pattern is: \n");
4891 DEBUG_PRINT_COMPILED_PATTERN(bufp, p, pend);
4892 DEBUG_PRINT1("The string to match is: `");
4893 DEBUG_PRINT_DOUBLE_STRING(d, string1, size1, string2, size2);
4894 DEBUG_PRINT1("'\n");
4896 /* This loops over pattern commands. It exits by returning from the
4897 function if the match is complete, or it drops through if the match
4898 fails at this starting point in the input data. */
4900 DEBUG_PRINT2("\n0x%lx: ", (long)p);
4901 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4902 if (!no_quit_in_re_search)
4907 /* End of pattern means we might have succeeded. */
4908 DEBUG_PRINT1("end of pattern ... ");
4910 /* If we haven't matched the entire string, and we want
4911 the longest match, try backtracking. */
4912 if (d != end_match_2) {
4913 same_str_p = (FIRST_STRING_P(match_end)
4914 == MATCHING_IN_FIRST_STRING);
4916 /* AIX compiler got confused when this was
4917 combined with the previous declaration. */
4919 best_match_p = d > match_end;
4922 !MATCHING_IN_FIRST_STRING;
4924 DEBUG_PRINT1("backtracking.\n");
4926 if (!FAIL_STACK_EMPTY()) {
4927 /* More failure points to try. */
4929 /* If exceeds best match so far, save
4931 if (!best_regs_set || best_match_p) {
4932 best_regs_set = true;
4936 ("\nSAVING match as best so far.\n");
4938 for (mcnt = 1; mcnt < num_regs;
4940 best_regstart[mcnt] =
4949 /* If no failure points, don't restore garbage.
4950 And if last match is real best match, don't
4951 restore second best one. */
4952 else if (best_regs_set && !best_match_p) {
4954 /* Restore best match. It may happen
4955 that `dend == end_match_1' while the
4956 restored d is in string2. For
4957 example, the pattern `x.*y.*z'
4958 against the strings `x-' and `y-z-',
4959 if the two strings are not
4960 consecutive in memory. */
4962 ("Restoring best registers.\n");
4965 dend = ((d >= string1 && d <= end1)
4966 ? end_match_1 : end_match_2);
4968 for (mcnt = 1; mcnt < num_regs; mcnt++) {
4970 best_regstart[mcnt];
4976 /* d != end_match_2 */
4978 DEBUG_PRINT1("Accepting match.\n");
4980 int num_nonshy_regs = bufp->re_nsub + 1;
4981 /* If caller wants register contents data back,
4983 if (regs && !bufp->no_sub) {
4984 /* Have the register data arrays been
4986 if (bufp->regs_allocated == REGS_UNALLOCATED) {
4987 /* No. So allocate them with malloc.
4988 We need one extra element beyond
4989 `num_regs' for the `-1' marker GNU
4991 regs->num_regs = MAX(RE_NREGS, num_nonshy_regs + 1);
4992 regs->start = TALLOC(regs->num_regs, regoff_t);
4993 regs->end = TALLOC(regs->num_regs, regoff_t);
4994 if (regs->start == NULL || regs->end == NULL) {
4998 bufp->regs_allocated = REGS_REALLOCATE;
4999 } else if (bufp->regs_allocated == REGS_REALLOCATE) {
5000 /* Yes. If we need more elements than were already
5001 allocated, reallocate them. If we need fewer, just
5003 if (regs->num_regs < num_nonshy_regs + 1) {
5004 regs->num_regs = num_nonshy_regs + 1;
5005 RETALLOC(regs->start, regs->num_regs, regoff_t);
5006 RETALLOC(regs->end, regs->num_regs, regoff_t);
5007 if (regs->start == NULL || regs->end == NULL) {
5013 /* The braces fend off a "empty body in an else-statement"
5014 warning under GCC when assert expands to nothing. */
5015 assert (bufp->regs_allocated == REGS_FIXED);
5018 /* Convert the pointer data in `regstart' and `regend' to
5019 indices. Register zero has to be set differently,
5020 since we haven't kept track of any info for it. */
5021 if (regs->num_regs > 0) {
5022 regs->start[0] = pos;
5023 regs->end[0] = (MATCHING_IN_FIRST_STRING
5024 ? ((regoff_t) (d - string1))
5025 : ((regoff_t) (d - string2 + size1)));
5028 /* Map over the NUM_NONSHY_REGS non-shy internal registers.
5029 Copy each into the corresponding external register.
5030 N.B. MCNT indexes external registers. */
5032 mcnt < MIN (num_nonshy_regs, regs->num_regs);
5034 int ireg = bufp->external_to_internal_register[mcnt];
5036 if (REG_UNSET (regstart[ireg]) || REG_UNSET (regend[ireg]))
5037 regs->start[mcnt] = regs->end[mcnt] = -1;
5040 = (regoff_t) POINTER_TO_OFFSET (regstart[ireg]);
5042 = (regoff_t) POINTER_TO_OFFSET (regend[ireg]);
5045 } /* regs && !bufp->no_sub */
5047 /* If we have regs and the regs structure has
5048 more elements than were in the pattern, set
5049 the extra elements to -1. If we
5050 (re)allocated the registers, this is the
5051 case, because we always allocate enough to
5052 have at least one -1 at the end.
5054 We do this even when no_sub is set because
5055 some applications (XEmacs) reuse register
5056 structures which may contain stale
5057 information, and permit attempts to access
5060 It would be possible to require the caller to
5061 do this, but we'd have to change the API for
5062 this function to reflect that, and audit all
5064 if (regs && regs->num_regs > 0)
5065 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++)
5066 regs->start[mcnt] = regs->end[mcnt] = -1;
5069 DEBUG_PRINT4("%u failure points pushed, %u popped (%u remain).\n",
5070 nfailure_points_pushed, nfailure_points_popped,
5071 nfailure_points_pushed - nfailure_points_popped);
5072 DEBUG_PRINT2("%u registers pushed.\n", num_regs_pushed);
5074 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
5078 DEBUG_PRINT2("Returning %d from re_match_2.\n", mcnt);
5084 /* Otherwise match next pattern command. */
5085 switch (SWITCH_ENUM_CAST((re_opcode_t) *p++)) {
5086 /* Ignore these. Used to ignore the n of succeed_n's
5087 which currently have n == 0. */
5089 DEBUG_PRINT1("EXECUTING no_op.\n");
5093 DEBUG_PRINT1("EXECUTING succeed.\n");
5096 /* Match the next n pattern characters exactly. The
5097 following byte in the pattern defines n, and the n
5098 bytes after that are the characters to match. */
5101 DEBUG_PRINT2("EXECUTING exactn %d.\n", mcnt);
5103 /* This is written out as an if-else so we don't waste
5104 time testing `translate' inside the loop. */
5105 if (TRANSLATE_P(translate)) {
5108 Emchar pat_ch, buf_ch;
5112 pat_ch = charptr_emchar(p);
5113 buf_ch = charptr_emchar(d);
5114 if (RE_TRANSLATE(buf_ch) != pat_ch)
5117 pat_len = charcount_to_bytecount(p, 1);
5122 #else /* not MULE */
5124 if ((unsigned char)RE_TRANSLATE(*d++) !=
5142 /* Match any character except possibly a newline or a
5145 DEBUG_PRINT1("EXECUTING anychar.\n");
5149 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5150 && TRANSLATE(*d) == '\n')
5151 || (bufp->syntax & RE_DOT_NOT_NULL
5152 && TRANSLATE(*d) == '\000'))
5156 DEBUG_PRINT2(" Matched `%d'.\n", *d);
5157 INC_CHARPTR(d); /* XEmacs change */
5162 REGISTER unsigned char c;
5164 (re_opcode_t) * (p - 1) == charset_not;
5166 DEBUG_PRINT2("EXECUTING charset%s.\n",
5167 not_p ? "_not" : "");
5170 /* The character to match. */
5173 /* Cast to `unsigned' instead of `unsigned char'
5174 in case the bit list is a full 32 bytes
5176 if (c < (unsigned)(*p * BYTEWIDTH)
5178 c / BYTEWIDTH] & (1 << (c %
5188 INC_CHARPTR(d); /* XEmacs change */
5194 case charset_mule_not: {
5197 (re_opcode_t) * (p - 1) == charset_mule_not;
5199 DEBUG_PRINT2("EXECUTING charset_mule%s.\n",
5200 not_p ? "_not" : "");
5203 c = charptr_emchar((const Bufbyte *)d);
5204 /* The character to match. */
5209 unified_range_table_lookup(p, c, Qnil)))
5212 p += unified_range_table_bytes_used(p);
5223 /* The beginning of a group is represented by
5224 start_memory. The arguments are the register number
5225 in the next byte, and the number of groups inner to
5226 this one in the next. The text matched within the
5227 group is recorded (in the internal registers data
5228 structure) under the register number. */
5230 DEBUG_PRINT3("EXECUTING start_memory %d (%d):\n", *p,
5233 /* Find out if this group can match the empty string. */
5234 p1 = p; /* To send to group_match_null_string_p. */
5236 if (REG_MATCH_NULL_STRING_P(reg_info[*p]) ==
5237 MATCH_NULL_UNSET_VALUE)
5238 REG_MATCH_NULL_STRING_P(reg_info[*p])
5239 = group_match_null_string_p(&p1, pend,
5242 /* Save the position in the string where we were the
5243 last time we were at this open-group operator in case
5244 the group is operated upon by a repetition operator,
5245 e.g., with `(a*)*b' against `ab'; then we want to
5246 ignore where we are now in the string in case this
5247 attempt to match fails. */
5248 old_regstart[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
5249 ? REG_UNSET(regstart[*p]) ? d : regstart[*p]
5251 DEBUG_PRINT2(" old_regstart: %d\n",
5252 POINTER_TO_OFFSET(old_regstart[*p]));
5255 DEBUG_PRINT2(" regstart: %d\n",
5256 POINTER_TO_OFFSET(regstart[*p]));
5258 IS_ACTIVE(reg_info[*p]) = 1;
5259 MATCHED_SOMETHING(reg_info[*p]) = 0;
5261 /* Clear this whenever we change the register activity
5263 set_regs_matched_done = 0;
5265 /* This is the new highest active register. */
5266 highest_active_reg = *p;
5268 /* If nothing was active before, this is the new lowest
5270 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5271 lowest_active_reg = *p;
5273 /* Move past the register number and inner group
5276 just_past_start_mem = p;
5280 /* The stop_memory opcode represents the end of a group.
5281 Its arguments are the same as start_memory's: the
5282 register number, and the number of inner groups. */
5284 DEBUG_PRINT3("EXECUTING stop_memory %d (%d):\n", *p,
5287 /* We need to save the string position the last time we
5288 were at this close-group operator in case the group
5289 is operated upon by a repetition operator, e.g., with
5290 `((a*)*(b*)*)*' against `aba'; then we want to ignore
5291 where we are now in the string in case this attempt
5293 old_regend[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
5294 ? REG_UNSET(regend[*p]) ? d : regend[*p]
5296 DEBUG_PRINT2(" old_regend: %d\n",
5297 POINTER_TO_OFFSET(old_regend[*p]));
5300 DEBUG_PRINT2(" regend: %d\n",
5301 POINTER_TO_OFFSET(regend[*p]));
5303 /* This register isn't active anymore. */
5304 IS_ACTIVE(reg_info[*p]) = 0;
5306 /* Clear this whenever we change the register activity
5308 set_regs_matched_done = 0;
5310 /* If this was the only register active, nothing is
5312 if (lowest_active_reg == highest_active_reg) {
5313 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5314 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5315 } else { /* We must scan for the new highest
5316 active register, since it isn't
5317 necessarily one less than now:
5318 consider (a(b)c(d(e)f)g). When group
5319 3 ends, after the f), the new highest
5320 active register is 1. */
5321 unsigned char r = *p - 1;
5322 while (r > 0 && !IS_ACTIVE(reg_info[r]))
5325 /* If we end up at register zero, that means
5326 that we saved the registers as the result of
5327 an `on_failure_jump', not a `start_memory',
5328 and we jumped to past the innermost
5329 `stop_memory'. For example, in ((.)*) we
5330 save registers 1 and 2 as a result of the *,
5331 but when we pop back to the second ), we are
5332 at the stop_memory 1. Thus, nothing is
5336 NO_LOWEST_ACTIVE_REG;
5337 highest_active_reg =
5338 NO_HIGHEST_ACTIVE_REG;
5340 highest_active_reg = r;
5342 /* 98/9/21 jhod: We've also gotta set
5343 lowest_active_reg, don't we? */
5345 while (r < highest_active_reg
5346 && !IS_ACTIVE(reg_info[r]))
5348 lowest_active_reg = r;
5352 /* If just failed to match something this time around
5353 with a group that's operated on by a repetition
5354 operator, try to force exit from the ``loop'', and
5355 restore the register information for this group that
5356 we had before trying this last match. */
5357 if ((!MATCHED_SOMETHING(reg_info[*p])
5358 || just_past_start_mem == p - 1)
5359 && (p + 2) < pend) {
5360 re_bool is_a_jump_n = false;
5364 switch ((unsigned int)(re_opcode_t)*p1++) {
5367 case pop_failure_jump:
5368 case maybe_pop_jump:
5370 case dummy_failure_jump:
5371 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
5381 /* If the next operation is a jump backwards in
5382 the pattern to an on_failure_jump right
5383 before the start_memory corresponding to this
5384 stop_memory, exit from the loop by forcing a
5385 failure after pushing on the stack the
5386 on_failure_jump's jump in the pattern, and
5389 && (re_opcode_t) * p1 == on_failure_jump
5390 && (re_opcode_t) p1[3] == start_memory
5392 /* If this group ever matched anything,
5393 then restore what its registers were
5394 before trying this last failed match,
5395 e.g., with `(a*)*b' against `ab' for
5396 regstart[1], and, e.g., with
5397 `((a*)*(b*)*)*' against `aba' for
5400 Also restore the registers for inner
5401 groups for, e.g., `((a*)(b*))*'
5402 against `aba' (register 3 would
5403 otherwise get trashed). */
5405 if (EVER_MATCHED_SOMETHING
5409 EVER_MATCHED_SOMETHING(reg_info
5413 /* Restore this and inner
5416 for (r = *p; r < *p + *(p + 1);
5421 /* xx why this test? */
5422 if (old_regend[r] >=
5430 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
5431 PUSH_FAILURE_POINT(p1 + mcnt, d, -2);
5437 /* Move past the register number and the inner group
5442 /* \<digit> has been turned into a `duplicate' command
5443 which is followed by the numeric value of <digit> as
5444 the register number. (Already passed through
5445 external-to-internal-register mapping, so it refers
5446 to the actual group number, not the non-shy-only
5447 numbering used in the external world.) */
5450 REGISTER re_char *d2, *dend2;
5451 /* Get which register to match against. */
5453 DEBUG_PRINT2("EXECUTING duplicate %d.\n",
5456 /* Can't back reference a group which we've
5458 if (REG_UNSET(regstart[regno])
5459 || REG_UNSET(regend[regno]))
5462 /* Where in input to try to start matching. */
5463 d2 = regstart[regno];
5465 /* Where to stop matching; if both the place to
5466 start and the place to stop matching are in
5467 the same string, then set to the place to
5468 stop, otherwise, for now have to use the end
5469 of the first string. */
5471 dend2 = ((FIRST_STRING_P(regstart[regno])
5472 == FIRST_STRING_P(regend[regno]))
5473 ? regend[regno] : end_match_1);
5475 /* If necessary, advance to next segment
5476 in register contents. */
5477 while (d2 == dend2) {
5478 if (dend2 == end_match_2)
5480 if (dend2 == regend[regno])
5483 /* End of string1 => advance to
5486 dend2 = regend[regno];
5488 /* At end of register contents =>
5493 /* If necessary, advance to next segment
5497 /* How many characters left in this segment to match. */
5500 /* Want how many consecutive characters
5501 we can match in one shot, so, if
5502 necessary, adjust the count. */
5503 if (mcnt > dend2 - d2)
5506 /* Compare that many; failure if
5507 mismatch, else move past them. */
5508 if (TRANSLATE_P(translate)
5510 (const unsigned char*)d,
5511 (const unsigned char*)d2,
5513 : memcmp(d, d2, mcnt)) {
5516 d += mcnt, d2 += mcnt;
5518 /* Do this because we've match some
5525 /* begline matches the empty string at the beginning of
5526 the string (unless `not_bol' is set in `bufp'), and,
5527 if `newline_anchor' is set, after newlines. */
5529 DEBUG_PRINT1("EXECUTING begline.\n");
5531 if (AT_STRINGS_BEG(d)) {
5534 } else if (d[-1] == '\n' && bufp->newline_anchor) {
5537 /* In all other cases, we fail. */
5540 /* endline is the dual of begline. */
5542 DEBUG_PRINT1("EXECUTING endline.\n");
5544 if (AT_STRINGS_END(d)) {
5549 /* We have to ``prefetch'' the next character. */
5550 else if ((d == end1 ? *string2 : *d) == '\n'
5551 && bufp->newline_anchor) {
5556 /* Match at the very beginning of the data. */
5558 DEBUG_PRINT1("EXECUTING begbuf.\n");
5559 if (AT_STRINGS_BEG(d))
5563 /* Match at the very end of the data. */
5565 DEBUG_PRINT1("EXECUTING endbuf.\n");
5566 if (AT_STRINGS_END(d))
5570 /* on_failure_keep_string_jump is used to optimize
5571 `.*\n'. It pushes NULL as the value for the string
5572 on the stack. Then `pop_failure_point' will keep the
5573 current value for the string, instead of restoring
5574 it. To see why, consider matching `foo\nbar' against
5575 `.*\n'. The .* matches the foo; then the . fails
5576 against the \n. But the next thing we want to do is
5577 match the \n against the \n; if we restored the
5578 string value, we would be back at the foo.
5580 Because this is used only in specific cases, we don't
5581 need to check all the things that `on_failure_jump'
5582 does, to make sure the right things get saved on the
5583 stack. Hence we don't share its code. The only
5584 reason to push anything on the stack at all is that
5585 otherwise we would have to change `anychar's code to
5586 do something besides goto fail in this case; that
5587 seems worse than this. */
5588 case on_failure_keep_string_jump:
5589 DEBUG_PRINT1("EXECUTING on_failure_keep_string_jump");
5591 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5592 DEBUG_PRINT3(" %d (to 0x%lx):\n", mcnt,
5595 PUSH_FAILURE_POINT(p + mcnt, (unsigned char *)0, -2);
5598 /* Uses of on_failure_jump:
5600 Each alternative starts with an on_failure_jump that
5601 points to the beginning of the next alternative.
5602 Each alternative except the last ends with a jump
5603 that in effect jumps past the rest of the
5604 alternatives. (They really jump to the ending jump
5605 of the following alternative, because tensioning
5606 these jumps is a hassle.)
5608 Repeats start with an on_failure_jump that points
5609 past both the repetition text and either the
5610 following jump or pop_failure_jump back to this
5612 case on_failure_jump:
5614 DEBUG_PRINT1("EXECUTING on_failure_jump");
5616 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5617 DEBUG_PRINT3(" %d (to 0x%lx)", mcnt, (long)(p + mcnt));
5619 /* If this on_failure_jump comes right before a group
5620 (i.e., the original * applied to a group), save the
5621 information for that group and all inner ones, so
5622 that if we fail back to this point, the group's
5623 information will be correct. For example, in
5624 \(a*\)*\1, we need the preceding group, and in
5625 \(\(a*\)b*\)\2, we need the inner group. */
5627 /* We can't use `p' to check ahead because we push
5628 a failure point to `p + mcnt' after we do this. */
5631 /* We need to skip no_op's before we look for the
5632 start_memory in case this on_failure_jump is
5633 happening as the result of a completed succeed_n, as
5634 in \(a\)\{1,3\}b\1 against aba. */
5635 while (p1 < pend && (re_opcode_t) * p1 == no_op)
5638 if (p1 < pend && (re_opcode_t) * p1 == start_memory) {
5639 /* We have a new highest active register now.
5640 This will get reset at the start_memory we
5641 are about to get to, but we will have saved
5642 all the registers relevant to this repetition
5643 op, as described above. */
5644 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5645 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5646 lowest_active_reg = *(p1 + 1);
5649 DEBUG_PRINT1(":\n");
5650 PUSH_FAILURE_POINT(p + mcnt, d, -2);
5653 /* A smart repeat ends with `maybe_pop_jump'.
5654 We change it to either `pop_failure_jump' or `jump'. */
5655 case maybe_pop_jump:
5656 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5657 DEBUG_PRINT2("EXECUTING maybe_pop_jump %d.\n", mcnt);
5659 REGISTER const unsigned char *p2 = p;
5661 /* Compare the beginning of the repeat with what
5662 in the pattern follows its end. If we can
5663 establish that there is nothing that they
5664 would both match, i.e., that we would have to
5665 backtrack because of (as in, e.g., `a*a')
5666 then we can change to pop_failure_jump,
5667 because we'll never have to backtrack.
5669 This is not true in the case of alternatives:
5670 in `(a|ab)*' we do need to backtrack to the
5671 `ab' alternative (e.g., if the string was
5672 `ab'). But instead of trying to detect that
5673 here, the alternative has put on a dummy
5674 failure point which is what we will end up
5677 /* Skip over open/close-group commands. If what
5678 follows this loop is a ...+ construct, look
5679 at what begins its body, since we will have
5680 to match at least one of that. */
5683 && ((re_opcode_t) * p2 ==
5685 || (re_opcode_t) * p2 ==
5688 else if (p2 + 6 < pend
5689 && (re_opcode_t) * p2 ==
5697 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5698 to the `maybe_finalize_jump' of this case. Examine what
5701 /* If we're at the end of the pattern, we can change. */
5703 /* Consider what happens when matching ":\(.*\)"
5704 against ":/". I don't really understand this code
5706 p[-3] = (unsigned char)pop_failure_jump;
5708 (" End of pattern: change to `pop_failure_jump'.\n");
5711 else if ((re_opcode_t) * p2 == exactn
5712 || (bufp->newline_anchor
5713 && (re_opcode_t) * p2 ==
5715 REGISTER unsigned char c =
5717 (unsigned char)endline ? '\n' :
5720 if ((re_opcode_t) p1[3] == exactn
5726 (" %c != %c => pop_failure_jump.\n",
5730 else if ((re_opcode_t) p1[3] == charset
5731 || (re_opcode_t) p1[3] ==
5734 (re_opcode_t) p1[3] ==
5738 (unsigned char)(p1[4] *
5742 BYTEWIDTH] & (1 << (c
5747 /* `not_p' is equal to 1 if c would match, which means
5748 that we can't change to pop_failure_jump. */
5754 (" No match => pop_failure_jump.\n");
5757 } else if ((re_opcode_t) * p2 == charset) {
5758 #ifdef REGEX_DEBUG_FLAG
5759 REGISTER unsigned char c
5762 (unsigned char)endline ? '\n' :
5766 if ((re_opcode_t) p1[3] == exactn
5767 && !((int)p2[1] * BYTEWIDTH >
5769 && (p2[2 + p1[5] / BYTEWIDTH]
5777 (" %c != %c => pop_failure_jump.\n",
5781 else if ((re_opcode_t) p1[3] ==
5784 /* We win if the charset_not inside the loop
5785 lists every character listed in the charset after. */
5786 for (idx = 0; idx < (int)p2[1];
5804 (" No match => pop_failure_jump.\n");
5806 } else if ((re_opcode_t) p1[3] ==
5809 /* We win if the charset inside the loop
5810 has no overlap with the one after the loop. */
5813 && idx < (int)p1[4]; idx++)
5824 (" No match => pop_failure_jump.\n");
5829 p -= 2; /* Point at relative address again. */
5830 if ((re_opcode_t) p[-1] != pop_failure_jump) {
5831 p[-1] = (unsigned char)jump;
5832 DEBUG_PRINT1(" Match => jump.\n");
5833 goto unconditional_jump;
5835 /* Note fall through. */
5837 /* The end of a simple repeat has a pop_failure_jump
5838 back to its matching on_failure_jump, where the
5839 latter will push a failure point. The
5840 pop_failure_jump takes off failure points put on by
5841 this pop_failure_jump's matching on_failure_jump; we
5842 got through the pattern to here from the matching
5843 on_failure_jump, so didn't fail. */
5844 case pop_failure_jump: {
5845 /* We need to pass separate storage for the
5846 lowest and highest registers, even though we
5847 don't care about the actual values.
5848 Otherwise, we will restore only one register
5849 from the stack, since lowest will == highest
5850 in `pop_failure_point'. */
5851 int dummy_low_reg, dummy_high_reg;
5852 const unsigned char *pdummy = NULL;
5853 const unsigned char *sdummy = NULL;
5855 DEBUG_PRINT1("EXECUTING pop_failure_jump.\n");
5856 POP_FAILURE_POINT(sdummy, pdummy,
5857 dummy_low_reg, dummy_high_reg,
5858 reg_dummy, reg_dummy,
5860 SXE_SET_UNUSED(pdummy), SXE_SET_UNUSED(sdummy);
5862 /* Note fall through. */
5864 /* Unconditionally jump (without popping any failure
5868 /* Get the amount to jump. */
5869 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5870 DEBUG_PRINT2("EXECUTING jump %d ", mcnt);
5871 p += mcnt; /* Do the jump. */
5872 DEBUG_PRINT2("(to 0x%lx).\n", (long)p);
5875 /* We need this opcode so we can detect where alternatives end
5876 in `group_match_null_string_p' et al. */
5878 DEBUG_PRINT1("EXECUTING jump_past_alt.\n");
5879 goto unconditional_jump;
5881 /* Normally, the on_failure_jump pushes a failure point, which
5882 then gets popped at pop_failure_jump. We will end up at
5883 pop_failure_jump, also, and with a pattern of, say, `a+', we
5884 are skipping over the on_failure_jump, so we have to push
5885 something meaningless for pop_failure_jump to pop. */
5886 case dummy_failure_jump:
5887 DEBUG_PRINT1("EXECUTING dummy_failure_jump.\n");
5888 /* It doesn't matter what we push for the string here. What
5889 the code at `fail' tests is the value for the pattern. */
5890 PUSH_FAILURE_POINT(NULL, NULL, -2);
5891 goto unconditional_jump;
5893 /* At the end of an alternative, we need to push a dummy failure
5894 point in case we are followed by a `pop_failure_jump', because
5895 we don't want the failure point for the alternative to be
5896 popped. For example, matching `(a|ab)*' against `aab'
5897 requires that we match the `ab' alternative. */
5898 case push_dummy_failure:
5899 DEBUG_PRINT1("EXECUTING push_dummy_failure.\n");
5900 /* See comments just above at `dummy_failure_jump' about the
5902 PUSH_FAILURE_POINT(NULL, NULL, -2);
5905 /* Have to succeed matching what follows at least n times.
5906 After that, handle like `on_failure_jump'. */
5908 EXTRACT_NUMBER(mcnt, p + 2);
5909 DEBUG_PRINT2("EXECUTING succeed_n %d.\n", mcnt);
5912 /* Originally, this is how many times we HAVE to succeed. */
5916 STORE_NUMBER_AND_INCR(p, mcnt);
5917 DEBUG_PRINT3(" Setting 0x%lx to %d.\n",
5919 } else if (mcnt == 0) {
5921 (" Setting two bytes from 0x%lx to no_op.\n",
5923 p[2] = (unsigned char)no_op;
5924 p[3] = (unsigned char)no_op;
5930 EXTRACT_NUMBER(mcnt, p + 2);
5931 DEBUG_PRINT2("EXECUTING jump_n %d.\n", mcnt);
5933 /* Originally, this is how many times we CAN jump. */
5936 STORE_NUMBER(p + 2, mcnt);
5937 goto unconditional_jump;
5939 /* If don't have to jump any more, skip over the rest of command. */
5946 DEBUG_PRINT1("EXECUTING set_number_at.\n");
5948 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5950 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5951 DEBUG_PRINT3(" Setting 0x%lx to %d.\n",
5953 STORE_NUMBER(p1, mcnt);
5958 DEBUG_PRINT1("EXECUTING wordbound.\n");
5963 /* Straightforward and (I hope) correct implementation.
5964 Probably should be optimized by arranging to compute
5966 /* emch1 is the character before d, syn1 is the syntax of
5967 emch1, emch2 is the character at d, and syn2 is the
5969 Emchar emch1, emch2;
5970 /* GCC isn't smart enough to see these are initialized if used. */
5971 int syn1 = 0, syn2 = 0;
5972 re_char *d_before, *d_after;
5974 at_beg = AT_STRINGS_BEG(d),
5975 at_end = AT_STRINGS_END(d);
5980 if (at_beg && at_end) {
5985 POS_BEFORE_GAP_UNSAFE(d);
5986 DEC_CHARPTR(d_before);
5988 charptr_emchar(d_before);
5991 SYNTAX_CACHE_BYTE_TO_CHAR
5992 (PTR_TO_OFFSET(d)) - 1;
5993 UPDATE_SYNTAX_CACHE(xpos);
5995 syn1 = SYNTAX_FROM_CACHE
5997 (regex_emacs_buffer->
5998 mirror_syntax_table),
6003 POS_AFTER_GAP_UNSAFE(d);
6004 emch2 = charptr_emchar(d_after);
6007 SYNTAX_CACHE_BYTE_TO_CHAR
6009 UPDATE_SYNTAX_CACHE_FORWARD(xpos
6013 syn2 = SYNTAX_FROM_CACHE
6015 (regex_emacs_buffer->
6016 mirror_syntax_table),
6021 result = (syn2 == Sword);
6023 result = (syn1 == Sword);
6030 if (result == should_succeed)
6036 DEBUG_PRINT1("EXECUTING notwordbound.\n");
6038 goto matchwordbound;
6041 DEBUG_PRINT1("EXECUTING wordbeg.\n");
6042 if (AT_STRINGS_END(d))
6045 /* XEmacs: this originally read:
6047 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
6051 re_char *dtmp = POS_AFTER_GAP_UNSAFE(d);
6052 Emchar emch = charptr_emchar(dtmp);
6055 SYNTAX_CACHE_BYTE_TO_CHAR(PTR_TO_OFFSET(d));
6056 UPDATE_SYNTAX_CACHE(charpos);
6058 if (SYNTAX_FROM_CACHE
6060 (regex_emacs_buffer->mirror_syntax_table),
6063 if (AT_STRINGS_BEG(d))
6065 dtmp = POS_BEFORE_GAP_UNSAFE(d);
6067 emch = charptr_emchar(dtmp);
6069 UPDATE_SYNTAX_CACHE_BACKWARD(charpos - 1);
6071 if (SYNTAX_FROM_CACHE
6073 (regex_emacs_buffer->mirror_syntax_table),
6080 DEBUG_PRINT1("EXECUTING wordend.\n");
6081 if (AT_STRINGS_BEG(d))
6084 /* XEmacs: this originally read:
6086 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
6087 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
6090 The or condition is incorrect (reversed).
6096 SYNTAX_CACHE_BYTE_TO_CHAR(PTR_TO_OFFSET(d))
6098 UPDATE_SYNTAX_CACHE(charpos);
6100 dtmp = POS_BEFORE_GAP_UNSAFE(d);
6102 emch = charptr_emchar(dtmp);
6103 if (SYNTAX_FROM_CACHE
6105 (regex_emacs_buffer->mirror_syntax_table),
6108 if (AT_STRINGS_END(d))
6110 dtmp = POS_AFTER_GAP_UNSAFE(d);
6111 emch = charptr_emchar(dtmp);
6113 UPDATE_SYNTAX_CACHE_FORWARD(charpos + 1);
6115 if (SYNTAX_FROM_CACHE
6117 (regex_emacs_buffer->mirror_syntax_table),
6125 DEBUG_PRINT1("EXECUTING before_dot.\n");
6127 (NILP(regex_match_object)
6128 || BUFFERP(regex_match_object))
6129 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6130 >= BUF_PT(regex_emacs_buffer)))
6135 DEBUG_PRINT1("EXECUTING at_dot.\n");
6137 (NILP(regex_match_object)
6138 || BUFFERP(regex_match_object))
6139 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6140 != BUF_PT(regex_emacs_buffer)))
6145 DEBUG_PRINT1("EXECUTING after_dot.\n");
6147 (NILP(regex_match_object)
6148 || BUFFERP(regex_match_object))
6149 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6150 <= BUF_PT(regex_emacs_buffer)))
6153 #if 0 /* not emacs19 */
6155 DEBUG_PRINT1("EXECUTING at_dot.\n");
6156 if (BUF_PTR_BYTE_POS
6157 (regex_emacs_buffer,
6158 (unsigned char *)d) + 1 !=
6159 BUF_PT(regex_emacs_buffer))
6162 #endif /* not emacs19 */
6165 DEBUG_PRINT2("EXECUTING syntaxspec %d.\n", mcnt);
6170 DEBUG_PRINT1("EXECUTING Emacs wordchar.\n");
6183 SYNTAX_CACHE_BYTE_TO_CHAR
6185 UPDATE_SYNTAX_CACHE(charpos);
6189 emch = charptr_emchar((const Bufbyte *)d);
6193 (regex_emacs_buffer->mirror_syntax_table),
6194 emch) == (enum syntaxcode)mcnt);
6196 if (matches != should_succeed)
6203 DEBUG_PRINT2("EXECUTING notsyntaxspec %d.\n", mcnt);
6205 goto matchnotsyntax;
6208 DEBUG_PRINT1("EXECUTING Emacs notwordchar.\n");
6212 goto matchornotsyntax;
6215 /* 97/2/17 jhod Mule category code patch */
6224 emch = charptr_emchar((const Bufbyte *)d);
6226 if (check_category_char
6227 (emch, regex_emacs_buffer->category_table,
6228 mcnt, should_succeed))
6234 case notcategoryspec:
6236 goto matchornotcategory;
6237 /* end of category patch */
6239 #else /* not emacs */
6241 DEBUG_PRINT1("EXECUTING non-Emacs wordchar.\n");
6243 if (!WORDCHAR_P_UNSAFE((int)(*d)))
6250 DEBUG_PRINT1("EXECUTING non-Emacs notwordchar.\n");
6252 if (!WORDCHAR_P_UNSAFE((int)(*d)))
6262 /* Successfully executed one pattern command; keep going. */
6265 /* We goto here if a matching operation fails. */
6267 if (!FAIL_STACK_EMPTY()) {
6268 /* A restart point is known. Restore to that state. */
6269 DEBUG_PRINT1("\nFAIL:\n");
6270 POP_FAILURE_POINT(d, p, lowest_active_reg,
6271 highest_active_reg, regstart, regend,
6274 /* If this failure point is a dummy, try the next one. */
6278 /* If we failed to the end of the pattern, don't examine
6282 re_bool is_a_jump_n = false;
6284 /* If failed to a backwards jump that's part of
6285 a repetition loop, need to pop this failure
6286 point and use the next one. */
6287 switch ((unsigned int)(re_opcode_t)*p) {
6290 case maybe_pop_jump:
6291 case pop_failure_jump:
6294 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6298 && (re_opcode_t) * p1 == succeed_n)
6300 && (re_opcode_t) * p1 ==
6309 if (d >= string1 && d <= end1)
6312 break; /* Matching at this starting point really fails. */
6316 goto restore_best_regs;
6320 return -1; /* Failure to match. */
6324 /* Subroutine definitions for re_match_2. */
6326 /* We are passed P pointing to a register number after a start_memory.
6328 Return true if the pattern up to the corresponding stop_memory can
6329 match the empty string, and false otherwise.
6331 If we find the matching stop_memory, sets P to point to one past its number.
6332 Otherwise, sets P to an undefined byte less than or equal to END.
6334 We don't handle duplicates properly (yet). */
6337 group_match_null_string_p(unsigned char **p, unsigned char *end,
6338 register_info_type * register_info)
6341 /* Point to after the args to the start_memory. */
6342 unsigned char *p1 = *p + 2;
6345 /* Skip over opcodes that can match nothing, and return true or
6346 false, as appropriate, when we get to one that can't, or to the
6347 matching stop_memory. */
6349 switch ((unsigned int)(re_opcode_t)*p1) {
6350 /* Could be either a loop or a series of alternatives. */
6351 case on_failure_jump:
6353 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6355 /* If the next operation is not a jump backwards in the
6359 /* Go through the on_failure_jumps of the alternatives,
6360 seeing if any of the alternatives cannot match nothing.
6361 The last alternative starts with only a jump,
6362 whereas the rest start with on_failure_jump and end
6363 with a jump, e.g., here is the pattern for `a|b|c':
6365 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
6366 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
6369 So, we have to first go through the first (n-1)
6370 alternatives and then deal with the last one separately. */
6372 /* Deal with the first (n-1) alternatives, which start
6373 with an on_failure_jump (see above) that jumps to right
6374 past a jump_past_alt. */
6376 while ((re_opcode_t) p1[mcnt - 3] ==
6378 /* `mcnt' holds how many bytes long the alternative
6379 is, including the ending `jump_past_alt' and
6382 if (!alt_match_null_string_p
6383 (p1, p1 + mcnt - 3, register_info))
6386 /* Move to right after this alternative, including the
6390 /* Break if it's the beginning of an n-th alternative
6391 that doesn't begin with an on_failure_jump. */
6392 if ((re_opcode_t) * p1 !=
6396 /* Still have to check that it's not an n-th
6397 alternative that starts with an on_failure_jump. */
6399 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6400 if ((re_opcode_t) p1[mcnt - 3] !=
6402 /* Get to the beginning of the n-th alternative. */
6408 /* Deal with the last alternative: go back and get number
6409 of the `jump_past_alt' just before it. `mcnt' contains
6410 the length of the alternative. */
6411 EXTRACT_NUMBER(mcnt, p1 - 2);
6413 if (!alt_match_null_string_p
6414 (p1, p1 + mcnt, register_info))
6417 p1 += mcnt; /* Get past the n-th alternative. */
6422 assert(p1[1] == **p);
6427 if (!common_op_match_null_string_p(&p1, end, register_info))
6430 } /* while p1 < end */
6433 } /* group_match_null_string_p */
6435 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6436 It expects P to be the first byte of a single alternative and END one
6437 byte past the last. The alternative can contain groups. */
6440 alt_match_null_string_p(unsigned char *p, unsigned char *end,
6441 register_info_type * register_info)
6444 unsigned char *p1 = p;
6447 /* Skip over opcodes that can match nothing, and break when we get
6448 to one that can't. */
6450 switch ((unsigned int)(re_opcode_t)*p1) {
6452 case on_failure_jump:
6454 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6459 if (!common_op_match_null_string_p(&p1, end, register_info))
6462 } /* while p1 < end */
6465 } /* alt_match_null_string_p */
6467 /* Deals with the ops common to group_match_null_string_p and
6468 alt_match_null_string_p.
6470 Sets P to one after the op and its arguments, if any. */
6473 common_op_match_null_string_p(unsigned char **p, unsigned char *end,
6474 register_info_type * register_info)
6479 unsigned char *p1 = *p;
6481 switch ((unsigned int)(re_opcode_t)*p1++) {
6500 assert(reg_no > 0 && reg_no <= MAX_REGNUM);
6501 ret = group_match_null_string_p(&p1, end, register_info);
6503 /* Have to set this here in case we're checking a group which
6504 contains a group and a back reference to it. */
6506 if (REG_MATCH_NULL_STRING_P(register_info[reg_no]) ==
6507 MATCH_NULL_UNSET_VALUE)
6508 REG_MATCH_NULL_STRING_P(register_info[reg_no]) = ret;
6514 /* If this is an optimized succeed_n for zero times, make the jump. */
6516 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6524 /* Get to the number of times to succeed. */
6526 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6530 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6537 if (!REG_MATCH_NULL_STRING_P(register_info[*p1]))
6546 /* All other opcodes mean we cannot match the empty string. */
6552 } /* common_op_match_null_string_p */
6554 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6555 bytes; nonzero otherwise. */
6558 bcmp_translate(re_char *s1, re_char *s2,
6559 REGISTER int len, RE_TRANSLATE_TYPE translate)
6561 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6563 const unsigned char *p1_end = s1 + len;
6564 const unsigned char *p2_end = s2 + len;
6566 while (p1 != p1_end && p2 != p2_end) {
6567 Emchar p1_ch, p2_ch;
6569 p1_ch = charptr_emchar(p1);
6570 p2_ch = charptr_emchar(p2);
6572 if (RE_TRANSLATE(p1_ch)
6573 != RE_TRANSLATE(p2_ch))
6578 #else /* not MULE */
6580 if (RE_TRANSLATE(*p1++) != RE_TRANSLATE(*p2++))
6588 /* Entry points for GNU code. */
6590 /* re_compile_pattern is the GNU regular expression compiler: it
6591 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6592 Returns 0 if the pattern was valid, otherwise an error string.
6594 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6595 are set in BUFP on entry.
6597 We call regex_compile to do the actual compilation. */
6599 const char *re_compile_pattern(const char *pattern, int length,
6600 struct re_pattern_buffer *bufp)
6604 /* GNU code is written to assume at least RE_NREGS registers will be set
6605 (and at least one extra will be -1). */
6606 bufp->regs_allocated = REGS_UNALLOCATED;
6608 /* And GNU code determines whether or not to get register information
6609 by passing null for the REGS argument to re_match, etc., not by
6613 /* Match anchors at newline. */
6614 bufp->newline_anchor = 1;
6616 ret = regex_compile((const unsigned char*)pattern,
6617 length, re_syntax_options, bufp);
6621 return gettext(re_error_msgid[(int)ret]);
6624 /* Entry points compatible with 4.2 BSD regex library. We don't define
6625 them unless specifically requested. */
6627 #ifdef _REGEX_RE_COMP
6629 /* BSD has one and only one pattern buffer. */
6630 static struct re_pattern_buffer re_comp_buf;
6632 char *re_comp(const char *s)
6637 if (!re_comp_buf.buffer)
6638 return gettext("No previous regular expression");
6642 if (!re_comp_buf.buffer) {
6643 re_comp_buf.buffer = (unsigned char *)xmalloc_atomic(200);
6644 if (re_comp_buf.buffer == NULL)
6645 return gettext(re_error_msgid[(int)REG_ESPACE]);
6646 re_comp_buf.allocated = 200;
6648 re_comp_buf.fastmap = (char *)xmalloc_atomic(1 << BYTEWIDTH);
6649 if (re_comp_buf.fastmap == NULL)
6650 return gettext(re_error_msgid[(int)REG_ESPACE]);
6653 /* Since `re_exec' always passes NULL for the `regs' argument, we
6654 don't need to initialize the pattern buffer fields which affect it. */
6656 /* Match anchors at newlines. */
6657 re_comp_buf.newline_anchor = 1;
6660 regex_compile((unsigned char *)s, strlen(s), re_syntax_options,
6666 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6667 return (char *)gettext(re_error_msgid[(int)ret]);
6670 int re_exec(const char *s)
6672 const int len = strlen(s);
6674 0 <= re_search(&re_comp_buf, s, len, 0, len,
6675 (struct re_registers *)0);
6677 #endif /* _REGEX_RE_COMP */
6679 /* POSIX.2 functions. Don't define these for Emacs. */
6683 /* regcomp takes a regular expression as a string and compiles it.
6685 PREG is a regex_t *. We do not expect any fields to be initialized,
6686 since POSIX says we shouldn't. Thus, we set
6688 `buffer' to the compiled pattern;
6689 `used' to the length of the compiled pattern;
6690 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6691 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6692 RE_SYNTAX_POSIX_BASIC;
6693 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6694 `fastmap' and `fastmap_accurate' to zero;
6695 `re_nsub' to the number of subexpressions in PATTERN.
6696 (non-shy of course. POSIX probably doesn't know about
6697 shy ones, and in any case they should be invisible.)
6699 PATTERN is the address of the pattern string.
6701 CFLAGS is a series of bits which affect compilation.
6703 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6704 use POSIX basic syntax.
6706 If REG_NEWLINE is set, then . and [^...] don't match newline.
6707 Also, regexec will try a match beginning after every newline.
6709 If REG_ICASE is set, then we considers upper- and lowercase
6710 versions of letters to be equivalent when matching.
6712 If REG_NOSUB is set, then when PREG is passed to regexec, that
6713 routine will report only success or failure, and nothing about the
6716 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6717 the return codes and their meanings.) */
6719 int regcomp(regex_t * preg, const char *pattern, int cflags)
6723 = (cflags & REG_EXTENDED) ?
6724 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6726 /* regex_compile will allocate the space for the compiled pattern. */
6728 preg->allocated = 0;
6731 /* Don't bother to use a fastmap when searching. This simplifies the
6732 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6733 characters after newlines into the fastmap. This way, we just try
6737 if (cflags & REG_ICASE) {
6740 preg->translate = (char *)xmalloc_atomic(CHAR_SET_SIZE);
6741 if (preg->translate == NULL)
6742 return (int)REG_ESPACE;
6744 /* Map uppercase characters to corresponding lowercase ones. */
6745 for (i = 0; i < CHAR_SET_SIZE; i++)
6746 preg->translate[i] = ISUPPER(i) ? tolower(i) : i;
6748 preg->translate = NULL;
6750 /* If REG_NEWLINE is set, newlines are treated differently. */
6751 if (cflags & REG_NEWLINE) {
6752 /* REG_NEWLINE implies neither . nor [^...] match newline. */
6753 syntax &= ~RE_DOT_NEWLINE;
6754 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6755 /* It also changes the matching behavior. */
6756 preg->newline_anchor = 1;
6758 preg->newline_anchor = 0;
6760 preg->no_sub = !!(cflags & REG_NOSUB);
6762 /* POSIX says a null character in the pattern terminates it, so we
6763 can use strlen here in compiling the pattern. */
6764 ret = regex_compile((const unsigned char*)pattern,
6765 strlen(pattern), syntax, preg);
6767 /* POSIX doesn't distinguish between an unmatched open-group and an
6768 unmatched close-group: both are REG_EPAREN. */
6769 if (ret == REG_ERPAREN)
6775 /* regexec searches for a given pattern, specified by PREG, in the
6778 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6779 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6780 least NMATCH elements, and we set them to the offsets of the
6781 corresponding matched substrings.
6783 EFLAGS specifies `execution flags' which affect matching: if
6784 REG_NOTBOL is set, then ^ does not match at the beginning of the
6785 string; if REG_NOTEOL is set, then $ does not match at the end.
6787 We return 0 if we find a match and REG_NOMATCH if not. */
6790 regexec(const regex_t * preg, const char *string, Element_count nmatch,
6791 regmatch_t pmatch[], int eflags)
6794 struct re_registers regs;
6795 regex_t private_preg;
6796 int len = strlen(string);
6797 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6799 private_preg = *preg;
6801 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6802 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6804 /* The user has told us exactly how many registers to return
6805 information about, via `nmatch'. We have to pass that on to the
6806 matching routines. */
6807 private_preg.regs_allocated = REGS_FIXED;
6809 if (want_reg_info) {
6810 regs.num_regs = nmatch;
6811 regs.start = TALLOC(nmatch, regoff_t);
6812 regs.end = TALLOC(nmatch, regoff_t);
6813 if (regs.start == NULL || regs.end == NULL)
6814 return (int)REG_NOMATCH;
6817 /* Perform the searching operation. */
6818 ret = re_search(&private_preg, string, len,
6819 /* start: */ 0, /* range: */ len,
6820 want_reg_info ? ®s : (struct re_registers *)0);
6822 /* Copy the register information to the POSIX structure. */
6823 if (want_reg_info) {
6827 for (r = 0; r < nmatch; r++) {
6828 pmatch[r].rm_so = regs.start[r];
6829 pmatch[r].rm_eo = regs.end[r];
6833 /* If we needed the temporary register info, free the space now. */
6838 /* We want zero return to mean success, unlike `re_search'. */
6839 return ret >= 0 ? (int)REG_NOERROR : (int)REG_NOMATCH;
6842 /* Returns a message corresponding to an error code, ERRCODE, returned
6843 from either regcomp or regexec. We don't use PREG here. */
6846 regerror(int errcode, const regex_t * preg, char *errbuf,
6847 Memory_count errbuf_size)
6850 Memory_count msg_size;
6852 if (errcode < 0 || (size_t) errcode >= (sizeof(re_error_msgid)
6853 / sizeof(re_error_msgid[0])))
6854 /* Only error codes returned by the rest of the code should be passed
6855 to this routine. If we are given anything else, or if other regex
6856 code generates an invalid error code, then the program has a bug.
6857 Dump core so we can fix it. */
6860 msg = gettext(re_error_msgid[errcode]);
6862 msg_size = strlen(msg) + 1; /* Includes the null. */
6864 if (errbuf_size != 0) {
6865 strncpy(errbuf, msg, errbuf_size - 1);
6866 errbuf[errbuf_size-1]='\0';
6872 /* Free dynamically allocated space used by PREG. */
6874 void regfree(regex_t * preg)
6876 if (preg->buffer != NULL) {
6877 xfree(preg->buffer);
6879 preg->buffer = NULL;
6881 preg->allocated = 0;
6884 if (preg->fastmap != NULL) {
6885 xfree(preg->fastmap);
6887 preg->fastmap = NULL;
6888 preg->fastmap_accurate = 0;
6890 if (preg->translate != NULL) {
6891 xfree(preg->translate);
6893 preg->translate = NULL;
6896 #endif /* not emacs */
6900 make-backup-files: t
6902 trim-versions-without-asking: nil