1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
10 This file is part of SXEmacs
12 SXEmacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 SXEmacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
86 /* The `emacs' switch turns on certain matching commands
87 that make sense only in Emacs. */
96 Lisp_Object Vthe_lisp_rangetab;
98 void complex_vars_of_regex(void)
100 Vthe_lisp_rangetab = Fmake_range_table();
101 staticpro(&Vthe_lisp_rangetab);
106 void complex_vars_of_regex(void)
112 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
113 #define TRANSLATE_P(tr) (!NILP (tr) && CHAR_TABLEP(tr))
115 /* just massage that xfree thing */
117 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
125 /* If we are not linking with Emacs proper,
126 we can't use the relocating allocator
127 even if config.h says that we can. */
130 #if defined (STDC_HEADERS) || defined (_LIBC)
132 # include <stdbool.h>
138 /* Types normally included via lisp.h */
139 #include <stddef.h> /* for ptrdiff_t */
142 #ifndef DECLARE_NOTHING
143 #define DECLARE_NOTHING struct nosuchstruct
149 #define charptr_emchar(str) ((Emchar) (str)[0])
151 #define INC_CHARPTR(p) ((p)++)
152 #define DEC_CHARPTR(p) ((p)--)
156 /* Define the syntax stuff for \<, \>, etc. */
158 /* This must be nonzero for the wordchar and notwordchar pattern
159 commands in re_match_2. */
166 extern char *re_syntax_table;
168 #else /* not SYNTAX_TABLE */
170 /* How many characters in the character set. */
171 #define CHAR_SET_SIZE 256
173 static char re_syntax_table[CHAR_SET_SIZE];
175 static void init_syntax_once(void)
180 const char *word_syntax_chars =
181 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
183 memset(re_syntax_table, 0, sizeof(re_syntax_table));
185 while (*word_syntax_chars)
186 re_syntax_table[(unsigned int)(*word_syntax_chars++)] =
193 #endif /* SYNTAX_TABLE */
195 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
196 #undef SYNTAX_FROM_CACHE
197 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
199 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
200 #define TRANSLATE_P(tr) tr
202 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
203 # if defined HAVE_GC_GC_H
205 # elif defined HAVE_GC_H
208 # define xmalloc GC_MALLOC
209 # define xmalloc_atomic GC_MALLOC_ATOMIC
210 # define xrealloc GC_REALLOC
211 # define xfree GC_FREE
212 # define xstrdup GC_STRDUP
213 #else /* !HAVE_BDWGC */
214 # define xmalloc malloc
215 # define xmalloc_atomic malloc
216 # define xrealloc realloc
218 # define xstrdup strdup
219 #endif /* HAVE_BDWGC */
224 /* Under XEmacs, this is needed because we don't define it elsewhere. */
225 #ifdef SWITCH_ENUM_BUG
226 #define SWITCH_ENUM_CAST(x) ((int)(x))
228 #define SWITCH_ENUM_CAST(x) (x)
231 /* Get the interface, including the syntax bits. */
234 /* isalpha etc. are used for the character classes. */
237 /* Jim Meyering writes:
239 "... Some ctype macros are valid only for character codes that
240 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
241 using /bin/cc or gcc but without giving an ansi option). So, all
242 ctype uses should be through macros like ISPRINT... If
243 STDC_HEADERS is defined, then autoconf has verified that the ctype
244 macros don't need to be guarded with references to isascii. ...
245 Defining isascii to 1 should let any compiler worth its salt
246 eliminate the && through constant folding." */
248 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
249 #define ISASCII_1(c) 1
251 #define ISASCII_1(c) isascii(c)
255 /* The IS*() macros can be passed any character, including an extended
256 one. We need to make sure there are no crashes, which would occur
257 otherwise due to out-of-bounds array references. */
258 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
260 #define ISASCII(c) ISASCII_1 (c)
264 #define ISBLANK(c) (ISASCII (c) && isblank (c))
266 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
269 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
271 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
274 /* stolen from FSF/src/charset.h */
275 #define SINGLE_BYTE_CHAR_P(c) (((unsigned int)(c) & 0xFF) == (c))
276 /* 1 if C is an ASCII character. */
277 #define IS_REAL_ASCII(c) ((c) < 128)
278 /* 1 if C is a unibyte character. */
279 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P(c))
281 #define ISPRINT(c) (ISASCII (c) && isprint (c))
282 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
283 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
284 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
285 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
286 #define ISLOWER(c) (ISASCII (c) && islower (c))
287 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
288 #define ISSPACE(c) (ISASCII (c) && isspace (c))
289 #define ISUPPER(c) (ISASCII (c) && isupper (c))
290 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
295 __attribute__((always_inline));
299 Lisp_Object ct = regex_emacs_buffer->mirror_syntax_table;
301 return (SYNTAX(XCHAR_TABLE(ct), c) == Sword);
304 # define ISWORD(c) (ISALPHA(c))
308 #define NULL (void *)0
311 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
312 since ours (we hope) works properly with all combinations of
313 machines, compilers, `char' and `unsigned char' argument types.
314 (Per Bothner suggested the basic approach.) */
315 #undef SIGN_EXTEND_CHAR
317 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
318 #else /* not __STDC__ */
319 /* As in Harbison and Steele. */
320 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
323 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
324 use `alloca' instead of `malloc'. This is because using malloc in
325 re_search* or re_match* could cause memory leaks when C-g is used in
326 Emacs; also, malloc is slower and causes storage fragmentation. On
327 the other hand, malloc is more portable, and easier to debug.
329 Because we sometimes use alloca, some routines have to be macros,
330 not functions -- `alloca'-allocated space disappears at the end of the
331 function it is called in. */
335 #define REGEX_ALLOCATE xmalloc
336 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc(source, nsize)
337 #define REGEX_FREE xfree
339 #else /* !REGEX_MALLOC */
341 /* Emacs already defines alloca, sometimes. */
344 /* Make alloca work the best possible way. */
346 #define alloca __builtin_alloca
347 #else /* not __GNUC__ */
350 #else /* not __GNUC__ or HAVE_ALLOCA_H */
351 #ifndef _AIX /* Already did AIX, up at the top. */
353 #endif /* not _AIX */
354 #endif /* HAVE_ALLOCA_H */
355 #endif /* __GNUC__ */
357 #endif /* not alloca */
359 #define REGEX_ALLOCATE alloca
361 /* Assumes a `char *destination' variable. */
362 #define REGEX_REALLOCATE(source, osize, nsize) \
363 (destination = (char *) alloca (nsize), \
364 memmove (destination, source, osize), \
367 /* No need to do anything to free, after alloca. */
368 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
370 #endif /* REGEX_MALLOC */
372 /* Define how to allocate the failure stack. */
376 #define REGEX_ALLOCATE_STACK(size) \
377 r_alloc ((char **) &failure_stack_ptr, (size))
378 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
379 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
380 #define REGEX_FREE_STACK(ptr) \
381 r_alloc_free ((void **) &failure_stack_ptr)
383 #else /* !REL_ALLOC */
387 #define REGEX_ALLOCATE_STACK xmalloc
388 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc(source, nsize)
389 #define REGEX_FREE_STACK xfree
391 #else /* !REGEX_MALLOC */
393 #define REGEX_ALLOCATE_STACK alloca
395 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
396 REGEX_REALLOCATE (source, osize, nsize)
397 /* No need to explicitly free anything. */
398 #define REGEX_FREE_STACK(arg)
400 #endif /* REGEX_MALLOC */
401 #endif /* REL_ALLOC */
403 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
404 `string1' or just past its end. This works if PTR is NULL, which is
406 #define FIRST_STRING_P(ptr) \
407 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
409 /* (Re)Allocate N items of type T using malloc, or fail. */
410 #define TALLOC(n, t) \
411 ((t *)xmalloc((n) * sizeof (t)))
412 #define TALLOC_ATOMIC(n, t) \
413 ((t *)xmalloc_atomic((n) * sizeof (t)))
414 #define RETALLOC(addr, n, t) \
415 ((addr) = (t *)xrealloc(addr, (n) * sizeof (t)))
416 #define RETALLOC_IF(addr, n, t) \
418 RETALLOC((addr), (n), t); \
420 (addr) = TALLOC ((n), t)
421 #define REGEX_TALLOC(n, t) \
422 ((t *)REGEX_ALLOCATE((n) * sizeof (t)))
424 #define BYTEWIDTH 8 /* In bits. */
426 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
430 #define MAX(a, b) ((a) > (b) ? (a) : (b))
431 #define MIN(a, b) ((a) < (b) ? (a) : (b))
433 /* Type of source-pattern and string chars. */
434 typedef const unsigned char re_char;
436 typedef char re_bool;
440 /* These are the command codes that appear in compiled regular
441 expressions. Some opcodes are followed by argument bytes. A
442 command code can specify any interpretation whatsoever for its
443 arguments. Zero bytes may appear in the compiled regular expression. */
448 /* Succeed right away--no more backtracking. */
451 /* Followed by one byte giving n, then by n literal bytes. */
454 /* Matches any (more or less) character. */
457 /* Matches any one char belonging to specified set. First
458 following byte is number of bitmap bytes. Then come bytes
459 for a bitmap saying which chars are in. Bits in each byte
460 are ordered low-bit-first. A character is in the set if its
461 bit is 1. A character too large to have a bit in the map is
462 automatically not in the set. */
465 /* Same parameters as charset, but match any character that is
466 not one of those specified. */
469 /* Start remembering the text that is matched, for storing in a
470 register. Followed by one byte with the register number, in
471 the range 1 to the pattern buffer's re_ngroups
472 field. Then followed by one byte with the number of groups
473 inner to this one. (This last has to be part of the
474 start_memory only because we need it in the on_failure_jump
478 /* Stop remembering the text that is matched and store it in a
479 memory register. Followed by one byte with the register
480 number, in the range 1 to `re_ngroups' in the
481 pattern buffer, and one byte with the number of inner groups,
482 just like `start_memory'. (We need the number of inner
483 groups here because we don't have any easy way of finding the
484 corresponding start_memory when we're at a stop_memory.) */
487 /* Match a duplicate of something remembered. Followed by one
488 byte containing the register number. */
491 /* Fail unless at beginning of line. */
494 /* Fail unless at end of line. */
497 /* Succeeds if at beginning of buffer (if emacs) or at beginning
498 of string to be matched (if not). */
501 /* Analogously, for end of buffer/string. */
504 /* Followed by two byte relative address to which to jump. */
507 /* Same as jump, but marks the end of an alternative. */
510 /* Followed by two-byte relative address of place to resume at
511 in case of failure. */
514 /* Like on_failure_jump, but pushes a placeholder instead of the
515 current string position when executed. */
516 on_failure_keep_string_jump,
518 /* Throw away latest failure point and then jump to following
519 two-byte relative address. */
522 /* Change to pop_failure_jump if know won't have to backtrack to
523 match; otherwise change to jump. This is used to jump
524 back to the beginning of a repeat. If what follows this jump
525 clearly won't match what the repeat does, such that we can be
526 sure that there is no use backtracking out of repetitions
527 already matched, then we change it to a pop_failure_jump.
528 Followed by two-byte address. */
531 /* Jump to following two-byte address, and push a dummy failure
532 point. This failure point will be thrown away if an attempt
533 is made to use it for a failure. A `+' construct makes this
534 before the first repeat. Also used as an intermediary kind
535 of jump when compiling an alternative. */
538 /* Push a dummy failure point and continue. Used at the end of
542 /* Followed by two-byte relative address and two-byte number n.
543 After matching N times, jump to the address upon failure. */
546 /* Followed by two-byte relative address, and two-byte number n.
547 Jump to the address N times, then fail. */
550 /* Set the following two-byte relative address to the
551 subsequent two-byte number. The address *includes* the two
555 wordchar, /* Matches any word-constituent character. */
556 notwordchar, /* Matches any char that is not a word-constituent. */
558 wordbeg, /* Succeeds if at word beginning. */
559 wordend, /* Succeeds if at word end. */
561 wordbound, /* Succeeds if at a word boundary. */
562 notwordbound /* Succeeds if not at a word boundary. */
564 , before_dot, /* Succeeds if before point. */
565 at_dot, /* Succeeds if at point. */
566 after_dot, /* Succeeds if after point. */
568 /* Matches any character whose syntax is specified. Followed by
569 a byte which contains a syntax code, e.g., Sword. */
572 /* Matches any character whose syntax is not that specified. */
576 /* need extra stuff to be able to properly work with XEmacs/Mule
577 characters (which may take up more than one byte) */
578 , charset_mule, /* Matches any character belonging to specified set.
579 The set is stored in "unified range-table
580 format"; see rangetab.c. Unlike the `charset'
581 opcode, this can handle arbitrary characters. */
583 charset_mule_not /* Same parameters as charset_mule, but match any
584 character that is not one of those specified. */
585 /* 97/2/17 jhod: The following two were merged back in from the Mule
586 2.3 code to enable some language specific processing */
587 , categoryspec, /* Matches entries in the character category tables */
588 notcategoryspec /* The opposite of the above */
592 /* Common operations on the compiled pattern. */
594 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
596 #define STORE_NUMBER(destination, number) \
598 (destination)[0] = (number) & 0377; \
599 (destination)[1] = (number) >> 8; \
602 /* Same as STORE_NUMBER, except increment DESTINATION to
603 the byte after where the number is stored. Therefore, DESTINATION
604 must be an lvalue. */
606 #define STORE_NUMBER_AND_INCR(destination, number) \
608 STORE_NUMBER (destination, number); \
609 (destination) += 2; \
612 /* Put into DESTINATION a number stored in two contiguous bytes starting
615 #define EXTRACT_NUMBER(destination, source) \
617 (destination) = *(source) & 0377; \
618 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
621 #ifdef REGEX_DEBUG_FLAG
622 static void extract_number(int *dest, re_char * source)
624 int temp = SIGN_EXTEND_CHAR(*(source + 1));
625 *dest = *source & 0377;
629 #ifndef EXTRACT_MACROS /* To debug the macros. */
630 #undef EXTRACT_NUMBER
631 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
632 #endif /* not EXTRACT_MACROS */
634 #endif /* REGEX_DEBUG_FLAG */
636 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
637 SOURCE must be an lvalue. */
639 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
641 EXTRACT_NUMBER (destination, source); \
645 #ifdef REGEX_DEBUG_FLAG
647 extract_number_and_incr(int *destination, unsigned char **source)
649 extract_number(destination, *source);
653 #ifndef EXTRACT_MACROS
654 #undef EXTRACT_NUMBER_AND_INCR
655 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
656 extract_number_and_incr (&dest, &src)
657 #endif /* not EXTRACT_MACROS */
659 #endif /* REGEX_DEBUG_FLAG */
661 /* If REGEX_DEBUG_FLAG is defined, Regex prints many voluminous messages about
662 what it is doing (if the variable `debug' is nonzero). If linked with the
663 main program in `iregex.c', you can enter patterns and strings interactively.
664 And if linked with the main program in `main.c' and the other test files, you
665 can run the already-written tests. */
667 #if defined (REGEX_DEBUG_FLAG)
669 /* We use standard I/O for debugging. */
673 /* XEmacs provides its own version of assert() */
674 /* It is useful to test things that ``must'' be true when debugging. */
679 /* no point in having this one, since we do not offer any accessor to this */
680 static int debug = 0;
683 #define DEBUG_STATEMENT(e) e
684 #define DEBUG_PRINT1(x) printf(x)
685 #define DEBUG_PRINT2(x1, x2) printf(x1, x2)
686 #define DEBUG_PRINT3(x1, x2, x3) printf(x1, x2, x3)
687 #define DEBUG_PRINT4(x1, x2, x3, x4) printf(x1, x2, x3, x4)
688 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
689 print_partial_compiled_pattern(s, e)
690 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
691 print_double_string (w, s1, sz1, s2, sz2)
693 /* Print the fastmap in human-readable form. */
695 static void print_fastmap(char *fastmap)
697 unsigned was_a_range = 0;
700 while (i < (1 << BYTEWIDTH)) {
704 while (i < (1 << BYTEWIDTH) && fastmap[i]) {
717 /* Print a compiled pattern string in human-readable form, starting at
718 the START pointer into it and ending just before the pointer END. */
720 /* can't help it ... the ugly mule code changes `start' upon alignment,
721 * so this can't have the const qualifier ... bugger */
723 print_partial_compiled_pattern(unsigned char *start, re_char *end)
726 unsigned char *p = start;
734 /* Loop over pattern commands. */
736 printf("%ld:\t", (long)(p - start));
738 switch ((const re_opcode_t)*p++) {
745 printf("/exactn/%d", mcnt);
755 printf("/start_memory/%d/%d", mcnt, *p++);
760 printf("/stop_memory/%d/%d", mcnt, *p++);
764 printf("/duplicate/%d", *p++);
773 REGISTER int c, last = -100;
774 REGISTER int in_range = 0;
776 printf("/charset [%s",
777 (re_opcode_t) * (p - 1) == charset_not
780 assert(p + *p < pend);
782 for (c = 0; c < 256; c++)
783 if (((unsigned char)(c / 8) < *p)
786 /* Are we starting a range? */
787 if (last + 1 == c && !in_range) {
791 /* Have we broken a range? */
792 else if (last + 1 != c
815 case charset_mule_not: {
818 printf("/charset_mule [%s",
819 (re_opcode_t) * (p - 1) == charset_mule_not
821 /* u_r_t_nentries() CHANGES its arg, why on earth
822 * is this marked const here then? :O -hrop */
823 nentries = unified_range_table_nentries(p);
824 for (i = 0; i < nentries; i++) {
825 EMACS_INT first, last;
826 Lisp_Object dummy_val;
828 /* u_r_t_get_range CHANGES its arg ...
829 * p can't be const thence */
830 unified_range_table_get_range(p, i,
837 printf("(0x%lx)", (long)first);
848 p += unified_range_table_bytes_used(p);
861 case on_failure_jump:
862 extract_number_and_incr(&mcnt, &p);
863 printf("/on_failure_jump to %ld",
864 (long)(p + mcnt - start));
867 case on_failure_keep_string_jump:
868 extract_number_and_incr(&mcnt, &p);
869 printf("/on_failure_keep_string_jump to %ld",
870 (long)(p + mcnt - start));
873 case dummy_failure_jump:
874 extract_number_and_incr(&mcnt, &p);
875 printf("/dummy_failure_jump to %ld",
876 (long)(p + mcnt - start));
879 case push_dummy_failure:
880 printf("/push_dummy_failure");
884 extract_number_and_incr(&mcnt, &p);
885 printf("/maybe_pop_jump to %ld",
886 (long)(p + mcnt - start));
889 case pop_failure_jump:
890 extract_number_and_incr(&mcnt, &p);
891 printf("/pop_failure_jump to %ld",
892 (long)(p + mcnt - start));
896 extract_number_and_incr(&mcnt, &p);
897 printf("/jump_past_alt to %ld",
898 (long)(p + mcnt - start));
902 extract_number_and_incr(&mcnt, &p);
903 printf("/jump to %ld", (long)(p + mcnt - start));
907 extract_number_and_incr(&mcnt, &p);
908 extract_number_and_incr(&mcnt2, &p);
909 printf("/succeed_n to %ld, %d times",
910 (long)(p + mcnt - start), mcnt2);
914 extract_number_and_incr(&mcnt, &p);
915 extract_number_and_incr(&mcnt2, &p);
916 printf("/jump_n to %ld, %d times",
917 (long)(p + mcnt - start), mcnt2);
921 extract_number_and_incr(&mcnt, &p);
922 extract_number_and_incr(&mcnt2, &p);
923 printf("/set_number_at location %ld to %d",
924 (long)(p + mcnt - start), mcnt2);
928 printf("/wordbound");
932 printf("/notwordbound");
944 printf("/before_dot");
952 printf("/after_dot");
956 printf("/syntaxspec");
962 printf("/notsyntaxspec");
968 /* 97/2/17 jhod Mule category patch */
970 printf("/categoryspec");
975 case notcategoryspec:
976 printf("/notcategoryspec");
980 /* end of category patch */
989 printf("/notwordchar");
1002 printf("?%d", *(p - 1));
1008 printf("%ld:\tend of pattern.\n", (long)(p - start));
1012 print_compiled_pattern(struct re_pattern_buffer *bufp)
1016 re_char *buffer = bufp->buffer;
1018 unsigned char *buffer = bufp->buffer;
1021 print_partial_compiled_pattern(buffer, buffer + bufp->used);
1022 printf("%ld bytes used/%ld bytes allocated.\n", bufp->used,
1025 if (bufp->fastmap_accurate && bufp->fastmap) {
1026 printf("fastmap: ");
1027 print_fastmap(bufp->fastmap);
1030 printf("re_nsub: %ld\t", (long)bufp->re_nsub);
1031 printf("re_ngroups: %ld\t", (long)bufp->re_ngroups);
1032 printf("regs_alloc: %d\t", bufp->regs_allocated);
1033 printf("can_be_null: %d\t", bufp->can_be_null);
1034 printf("newline_anchor: %d\n", bufp->newline_anchor);
1035 printf("no_sub: %d\t", bufp->no_sub);
1036 printf("not_bol: %d\t", bufp->not_bol);
1037 printf("not_eol: %d\t", bufp->not_eol);
1038 printf("syntax: %d\n", bufp->syntax);
1039 /* Perhaps we should print the translate table? */
1040 /* and maybe the category table? */
1042 if (bufp->external_to_internal_register) {
1045 printf("external_to_internal_register:\n");
1046 for (i = 0; i <= bufp->re_nsub; i++) {
1049 printf("%d -> %d", i,
1050 bufp->external_to_internal_register[i]);
1057 print_double_string(re_char * where, re_char * string1, int size1,
1058 re_char * string2, int size2)
1063 Element_count this_char;
1065 if (FIRST_STRING_P(where)) {
1066 for (this_char = where - string1; this_char < size1;
1068 putchar(string1[this_char]);
1073 for (this_char = where - string2; this_char < size2;
1075 putchar(string2[this_char]);
1079 #else /* not REGEX_DEBUG_FLAG */
1084 #define DEBUG_STATEMENT(e)
1085 #define DEBUG_PRINT1(x)
1086 #define DEBUG_PRINT2(x1, x2)
1087 #define DEBUG_PRINT3(x1, x2, x3)
1088 #define DEBUG_PRINT4(x1, x2, x3, x4)
1089 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1090 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1092 #endif /* REGEX_DEBUG_FLAG */
1094 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1095 also be assigned to arbitrarily: each pattern buffer stores its own
1096 syntax, so it can be changed between regex compilations. */
1097 /* This has no initializer because initialized variables in Emacs
1098 become read-only after dumping. */
1099 reg_syntax_t re_syntax_options;
1101 /* Specify the precise syntax of regexps for compilation. This provides
1102 for compatibility for various utilities which historically have
1103 different, incompatible syntaxes.
1105 The argument SYNTAX is a bit mask comprised of the various bits
1106 defined in regex.h. We return the old syntax. */
1108 reg_syntax_t re_set_syntax(reg_syntax_t syntax)
1110 reg_syntax_t ret = re_syntax_options;
1112 re_syntax_options = syntax;
1116 /* This table gives an error message for each of the error codes listed
1117 in regex.h. Obviously the order here has to be same as there.
1118 POSIX doesn't require that we do anything for REG_NOERROR,
1119 but why not be nice? */
1121 static const char *re_error_msgid[] = {
1122 "Success", /* REG_NOERROR */
1123 "No match", /* REG_NOMATCH */
1124 "Invalid regular expression", /* REG_BADPAT */
1125 "Invalid collation character", /* REG_ECOLLATE */
1126 "Invalid character class name", /* REG_ECTYPE */
1127 "Trailing backslash", /* REG_EESCAPE */
1128 "Invalid back reference", /* REG_ESUBREG */
1129 "Unmatched [ or [^", /* REG_EBRACK */
1130 "Unmatched ( or \\(", /* REG_EPAREN */
1131 "Unmatched \\{", /* REG_EBRACE */
1132 "Invalid content of \\{\\}", /* REG_BADBR */
1133 "Invalid range end", /* REG_ERANGE */
1134 "Memory exhausted", /* REG_ESPACE */
1135 "Invalid preceding regular expression", /* REG_BADRPT */
1136 "Premature end of regular expression", /* REG_EEND */
1137 "Regular expression too big", /* REG_ESIZE */
1138 "Unmatched ) or \\)", /* REG_ERPAREN */
1140 "Invalid syntax designator", /* REG_ESYNTAX */
1143 "Ranges may not span charsets", /* REG_ERANGESPAN */
1144 "Invalid category designator", /* REG_ECATEGORY */
1148 /* Avoiding alloca during matching, to placate r_alloc. */
1150 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1151 searching and matching functions should not call alloca. On some
1152 systems, alloca is implemented in terms of malloc, and if we're
1153 using the relocating allocator routines, then malloc could cause a
1154 relocation, which might (if the strings being searched are in the
1155 ralloc heap) shift the data out from underneath the regexp
1158 Here's another reason to avoid allocation: Emacs
1159 processes input from X in a signal handler; processing X input may
1160 call malloc; if input arrives while a matching routine is calling
1161 malloc, then we're scrod. But Emacs can't just block input while
1162 calling matching routines; then we don't notice interrupts when
1163 they come in. So, Emacs blocks input around all regexp calls
1164 except the matching calls, which it leaves unprotected, in the
1165 faith that they will not malloc. */
1167 /* Normally, this is fine. */
1168 #define MATCH_MAY_ALLOCATE
1170 /* When using GNU C, we are not REALLY using the C alloca, no matter
1171 what config.h may say. So don't take precautions for it. */
1176 /* The match routines may not allocate if (1) they would do it with malloc
1177 and (2) it's not safe for them to use malloc.
1178 Note that if REL_ALLOC is defined, matching would not use malloc for the
1179 failure stack, but we would still use it for the register vectors;
1180 so REL_ALLOC should not affect this. */
1181 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1182 #undef MATCH_MAY_ALLOCATE
1185 /* Failure stack declarations and macros; both re_compile_fastmap and
1186 re_match_2 use a failure stack. These have to be macros because of
1187 REGEX_ALLOCATE_STACK. */
1189 /* Number of failure points for which to initially allocate space
1190 when matching. If this number is exceeded, we allocate more
1191 space, so it is not a hard limit. */
1192 #ifndef INIT_FAILURE_ALLOC
1193 #define INIT_FAILURE_ALLOC 20
1196 /* Roughly the maximum number of failure points on the stack. Would be
1197 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1198 This is a variable only so users of regex can assign to it; we never
1199 change it ourselves. */
1200 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1201 /* 4400 was enough to cause a crash on Alpha OSF/1,
1202 whose default stack limit is 2mb. */
1203 int re_max_failures = 50000;
1205 int re_max_failures = 2000;
1208 union fail_stack_elt {
1209 const unsigned char *pointer;
1213 typedef union fail_stack_elt fail_stack_elt_t;
1216 fail_stack_elt_t *stack;
1218 Element_count avail; /* Offset of next open position. */
1221 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1222 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1223 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1225 /* Define macros to initialize and free the failure stack.
1226 Do `return -2' if the alloc fails. */
1228 #ifdef MATCH_MAY_ALLOCATE
1229 #define INIT_FAIL_STACK() \
1231 fail_stack.stack = (fail_stack_elt_t *) \
1232 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \
1233 sizeof (fail_stack_elt_t)); \
1235 if (fail_stack.stack == NULL) \
1238 fail_stack.size = INIT_FAILURE_ALLOC; \
1239 fail_stack.avail = 0; \
1242 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1244 #define INIT_FAIL_STACK() \
1246 fail_stack.avail = 0; \
1249 #define RESET_FAIL_STACK()
1252 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1254 Return 1 if succeeds, and 0 if either ran out of memory
1255 allocating space for it or it was already too large.
1257 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1259 #define DOUBLE_FAIL_STACK(fail_stack) \
1260 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1262 : ((fail_stack).stack = (fail_stack_elt_t *) \
1263 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1264 (fail_stack).size * sizeof (fail_stack_elt_t), \
1265 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1267 (fail_stack).stack == NULL \
1269 : ((fail_stack).size <<= 1, \
1272 /* Push pointer POINTER on FAIL_STACK.
1273 Return 1 if was able to do so and 0 if ran out of memory allocating
1275 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1276 ((FAIL_STACK_FULL () \
1277 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1279 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1282 /* Push a pointer value onto the failure stack.
1283 Assumes the variable `fail_stack'. Probably should only
1284 be called from within `PUSH_FAILURE_POINT'. */
1285 #define PUSH_FAILURE_POINTER(item) \
1286 fail_stack.stack[fail_stack.avail++].pointer = \
1287 (const unsigned char*)(item)
1289 /* This pushes an integer-valued item onto the failure stack.
1290 Assumes the variable `fail_stack'. Probably should only
1291 be called from within `PUSH_FAILURE_POINT'. */
1292 #define PUSH_FAILURE_INT(item) \
1293 fail_stack.stack[fail_stack.avail++].integer = (item)
1295 /* Push a fail_stack_elt_t value onto the failure stack.
1296 Assumes the variable `fail_stack'. Probably should only
1297 be called from within `PUSH_FAILURE_POINT'. */
1298 #define PUSH_FAILURE_ELT(item) \
1299 fail_stack.stack[fail_stack.avail++] = (item)
1301 /* These three POP... operations complement the three PUSH... operations.
1302 All assume that `fail_stack' is nonempty. */
1303 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1304 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1305 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1307 /* Used to omit pushing failure point id's when we're not debugging. */
1308 #ifdef REGEX_DEBUG_FLAG
1309 #define DEBUG_PUSH PUSH_FAILURE_INT
1310 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1312 #define DEBUG_PUSH(item)
1313 #define DEBUG_POP(item_addr)
1316 /* Push the information about the state we will need
1317 if we ever fail back to it.
1319 Requires variables fail_stack, regstart, regend, reg_info, and
1320 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1323 Does `return FAILURE_CODE' if runs out of memory. */
1325 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1326 #define DECLARE_DESTINATION char *destination
1328 #define DECLARE_DESTINATION DECLARE_NOTHING
1331 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1333 DECLARE_DESTINATION; \
1334 /* Must be int, so when we don't save any registers, the\
1335 arithmetic \ of 0 + -1 isn't done as unsigned. */ \
1338 DEBUG_STATEMENT (failure_id++); \
1339 DEBUG_STATEMENT (nfailure_points_pushed++); \
1340 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1341 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1342 (long unsigned int)(fail_stack).avail); \
1343 DEBUG_PRINT2 (" size: %lu\n", \
1344 (long unsigned int)(fail_stack).size); \
1346 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1347 DEBUG_PRINT2 (" available: %ld\n", \
1348 (long) REMAINING_AVAIL_SLOTS); \
1350 /* Ensure we have enough space allocated for what \
1351 * we will push. */ \
1352 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) { \
1353 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1354 return failure_code; \
1356 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1357 (unsigned long) (fail_stack).size); \
1358 DEBUG_PRINT2 (" slots available: %ld\n", \
1359 (long) REMAINING_AVAIL_SLOTS); \
1362 /* Push the info, starting with the registers. */ \
1363 DEBUG_PRINT1 ("\n"); \
1365 for (this_reg = lowest_active_reg; \
1366 this_reg <= highest_active_reg; \
1368 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1369 DEBUG_STATEMENT (num_regs_pushed++); \
1371 DEBUG_PRINT2(" start: %p\n", \
1372 regstart[this_reg]); \
1373 PUSH_FAILURE_POINTER(regstart[this_reg]); \
1375 DEBUG_PRINT2 (" end: %p\n", \
1376 regend[this_reg]); \
1377 PUSH_FAILURE_POINTER(regend[this_reg]); \
1379 DEBUG_PRINT2 (" info: 0x%p\n ", \
1380 ®_info[this_reg]); \
1381 DEBUG_PRINT2 (" match_null=%d", \
1382 REG_MATCH_NULL_STRING_P \
1383 (reg_info[this_reg])); \
1384 DEBUG_PRINT2 (" active=%d", \
1385 IS_ACTIVE (reg_info[this_reg])); \
1386 DEBUG_PRINT2 (" matched_something=%d", \
1388 (reg_info[this_reg])); \
1389 DEBUG_PRINT2 (" ever_matched_something=%d", \
1390 EVER_MATCHED_SOMETHING \
1391 (reg_info[this_reg])); \
1392 DEBUG_PRINT1 ("\n"); \
1393 PUSH_FAILURE_ELT(reg_info[this_reg].word); \
1396 DEBUG_PRINT2 (" Pushing low active reg: %d\n", \
1397 lowest_active_reg); \
1398 PUSH_FAILURE_INT (lowest_active_reg); \
1400 DEBUG_PRINT2 (" Pushing high active reg: %d\n", \
1401 highest_active_reg); \
1402 PUSH_FAILURE_INT (highest_active_reg); \
1404 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", \
1405 (long) pattern_place); \
1406 DEBUG_PRINT_COMPILED_PATTERN(bufp, pattern_place, pend); \
1407 PUSH_FAILURE_POINTER (pattern_place); \
1409 DEBUG_PRINT2(" Pushing string %p: `", \
1411 DEBUG_PRINT_DOUBLE_STRING(string_place, \
1414 DEBUG_PRINT1("'\n"); \
1415 PUSH_FAILURE_POINTER (string_place); \
1417 DEBUG_PRINT2(" Pushing failure id: %u\n", failure_id); \
1418 DEBUG_PUSH(failure_id); \
1421 /* This is the number of items that are pushed and popped on the stack
1422 for each register. */
1423 #define NUM_REG_ITEMS 3
1425 /* Individual items aside from the registers. */
1427 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1429 #define NUM_NONREG_ITEMS 4
1432 /* We push at most this many items on the stack. */
1433 /* We used to use (num_regs - 1), which is the number of registers
1434 this regexp will save; but that was changed to 5
1435 to avoid stack overflow for a regexp with lots of parens. */
1436 #define MAX_FAILURE_ITEMS (5 * (NUM_REG_ITEMS + NUM_NONREG_ITEMS))
1438 /* We actually push this many items. */
1439 #define NUM_FAILURE_ITEMS \
1440 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1443 /* How many items can still be added to the stack without overflowing it. */
1444 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1446 /* Pops what PUSH_FAIL_STACK pushes.
1448 We restore into the parameters, all of which should be lvalues:
1449 STR -- the saved data position.
1450 PAT -- the saved pattern position.
1451 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1452 REGSTART, REGEND -- arrays of string positions.
1453 REG_INFO -- array of information about each subexpression.
1455 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1456 `pend', `string1', `size1', `string2', and `size2'. */
1458 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1459 regstart, regend, reg_info) \
1461 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id;) \
1463 const unsigned char *string_temp; \
1465 assert (!FAIL_STACK_EMPTY ()); \
1467 /* Remove failure points and point to \
1468 * how many regs pushed. */ \
1469 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1470 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1471 (unsigned long) fail_stack.avail); \
1472 DEBUG_PRINT2 (" size: %lu\n", \
1473 (unsigned long) fail_stack.size); \
1475 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1477 DEBUG_POP (&ffailure_id.integer); \
1478 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1479 * (unsigned int *) &ffailure_id); \
1481 /* If the saved string location is NULL, it \
1482 came from an on_failure_keep_string_jump opcode, \
1483 and we want to throw away the saved NULL, thus \
1484 retaining our current position in the string. */ \
1485 string_temp = POP_FAILURE_POINTER (); \
1486 if (string_temp != NULL) \
1487 str = string_temp; \
1489 DEBUG_PRINT2(" Popping string %p: `", str); \
1490 DEBUG_PRINT_DOUBLE_STRING(str, \
1493 DEBUG_PRINT1("'\n"); \
1495 pat = (unsigned char *) POP_FAILURE_POINTER(); \
1496 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1497 DEBUG_PRINT_COMPILED_PATTERN(bufp, pat, pend); \
1499 /* Restore register info. */ \
1500 high_reg = (unsigned) POP_FAILURE_INT (); \
1501 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1503 low_reg = (unsigned) POP_FAILURE_INT (); \
1504 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1506 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) { \
1507 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1509 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1510 DEBUG_PRINT2 (" info: %p\n", \
1511 ®_info[this_reg]); \
1513 regend[this_reg] = POP_FAILURE_POINTER (); \
1514 DEBUG_PRINT2 (" end: %p\n", \
1515 regend[this_reg]); \
1517 regstart[this_reg] = POP_FAILURE_POINTER (); \
1518 DEBUG_PRINT2 (" start: %p\n", \
1519 regstart[this_reg]); \
1522 set_regs_matched_done = 0; \
1523 DEBUG_STATEMENT (nfailure_points_popped++); \
1524 } while (0) /* POP_FAILURE_POINT */
1526 /* Structure for per-register (a.k.a. per-group) information.
1527 Other register information, such as the
1528 starting and ending positions (which are addresses), and the list of
1529 inner groups (which is a bits list) are maintained in separate
1532 We are making a (strictly speaking) nonportable assumption here: that
1533 the compiler will pack our bit fields into something that fits into
1534 the type of `word', i.e., is something that fits into one item on the
1538 fail_stack_elt_t word;
1540 /* This field is one if this group can match the empty string,
1541 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1542 #define MATCH_NULL_UNSET_VALUE 3
1543 unsigned match_null_string_p:2;
1544 unsigned is_active:1;
1545 unsigned matched_something:1;
1546 unsigned ever_matched_something:1;
1548 } register_info_type;
1550 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1551 #define IS_ACTIVE(R) ((R).bits.is_active)
1552 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1553 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1555 /* Call this when have matched a real character; it sets `matched' flags
1556 for the subexpressions which we are currently inside. Also records
1557 that those subexprs have matched. */
1558 #define SET_REGS_MATCHED() \
1561 if (!set_regs_matched_done) \
1564 set_regs_matched_done = 1; \
1565 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1567 MATCHED_SOMETHING (reg_info[r]) \
1568 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1575 /* Registers are set to a sentinel when they haven't yet matched. */
1576 static unsigned char reg_unset_dummy;
1577 #define REG_UNSET_VALUE (®_unset_dummy)
1578 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1580 /* Subroutine declarations and macros for regex_compile. */
1582 /* Fetch the next character in the uncompiled pattern---translating it
1583 if necessary. Also cast from a signed character in the constant
1584 string passed to us by the user to an unsigned char that we can use
1585 as an array index (in, e.g., `translate'). */
1586 #define PATFETCH(c) \
1589 c = TRANSLATE (c); \
1592 /* Fetch the next character in the uncompiled pattern, with no
1594 #define PATFETCH_RAW(c) \
1595 do {if (p == pend) return REG_EEND; \
1596 assert (p < pend); \
1597 c = charptr_emchar (p); \
1601 /* Go backwards one character in the pattern. */
1602 #define PATUNFETCH DEC_CHARPTR (p)
1606 #define PATFETCH_EXTENDED(emch) \
1607 do {if (p == pend) return REG_EEND; \
1608 assert (p < pend); \
1609 emch = charptr_emchar ((const Bufbyte *) p); \
1611 if (TRANSLATE_P (translate) && emch < 0x80) \
1612 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1615 #define PATFETCH_RAW_EXTENDED(emch) \
1616 do {if (p == pend) return REG_EEND; \
1617 assert (p < pend); \
1618 emch = charptr_emchar ((const Bufbyte *) p); \
1622 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1624 #define PATFETCH_EITHER(emch) \
1626 if (has_extended_chars) \
1627 PATFETCH_EXTENDED (emch); \
1632 #define PATFETCH_RAW_EITHER(emch) \
1634 if (has_extended_chars) \
1635 PATFETCH_RAW_EXTENDED (emch); \
1637 PATFETCH_RAW (emch); \
1640 #define PATUNFETCH_EITHER \
1642 if (has_extended_chars) \
1643 PATUNFETCH_EXTENDED (emch); \
1645 PATUNFETCH (emch); \
1648 #else /* not MULE */
1650 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1651 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1652 #define PATUNFETCH_EITHER PATUNFETCH
1656 /* If `translate' is non-null, return translate[D], else just D. We
1657 cast the subscript to translate because some data is declared as
1658 `char *', to avoid warnings when a string constant is passed. But
1659 when we use a character as a subscript we must make it unsigned. */
1660 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1662 /* Macros for outputting the compiled pattern into `buffer'. */
1664 /* If the buffer isn't allocated when it comes in, use this. */
1665 #define INIT_BUF_SIZE 32
1667 /* Make sure we have at least N more bytes of space in buffer. */
1668 #define GET_BUFFER_SPACE(n) \
1669 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1672 /* Make sure we have one more byte of buffer space and then add C to it. */
1673 #define BUF_PUSH(c) \
1675 GET_BUFFER_SPACE (1); \
1676 *buf_end++ = (unsigned char) (c); \
1679 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1680 #define BUF_PUSH_2(c1, c2) \
1682 GET_BUFFER_SPACE (2); \
1683 *buf_end++ = (unsigned char) (c1); \
1684 *buf_end++ = (unsigned char) (c2); \
1687 /* As with BUF_PUSH_2, except for three bytes. */
1688 #define BUF_PUSH_3(c1, c2, c3) \
1690 GET_BUFFER_SPACE (3); \
1691 *buf_end++ = (unsigned char) (c1); \
1692 *buf_end++ = (unsigned char) (c2); \
1693 *buf_end++ = (unsigned char) (c3); \
1696 /* Store a jump with opcode OP at LOC to location TO. We store a
1697 relative address offset by the three bytes the jump itself occupies. */
1698 #define STORE_JUMP(op, loc, to) \
1699 store_op1 (op, loc, (to) - (loc) - 3)
1701 /* Likewise, for a two-argument jump. */
1702 #define STORE_JUMP2(op, loc, to, arg) \
1703 store_op2 (op, loc, (to) - (loc) - 3, arg)
1705 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1707 #define INSERT_JUMP(op, loc, to) \
1708 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1710 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1712 #define INSERT_JUMP2(op, loc, to, arg) \
1713 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1715 /* This is not an arbitrary limit: the arguments which represent offsets
1716 into the pattern are two bytes long. So if 2^16 bytes turns out to
1717 be too small, many things would have to change. */
1718 #define MAX_BUF_SIZE (1L << 16)
1720 /* Extend the buffer by twice its current size via realloc and
1721 reset the pointers that pointed into the old block to point to the
1722 correct places in the new one. If extending the buffer results in it
1723 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1724 #define EXTEND_BUFFER() \
1726 re_char *old_buffer = bufp->buffer; \
1727 if (bufp->allocated == MAX_BUF_SIZE) \
1729 bufp->allocated <<= 1; \
1730 if (bufp->allocated > MAX_BUF_SIZE) \
1731 bufp->allocated = MAX_BUF_SIZE; \
1732 bufp->buffer = (unsigned char *)xrealloc( \
1733 bufp->buffer, bufp->allocated); \
1734 if (bufp->buffer == NULL) { \
1735 return REG_ESPACE; \
1737 /* If the buffer moved, move all the pointers into it. */ \
1738 if (old_buffer != bufp->buffer) { \
1739 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1740 begalt = (begalt - old_buffer) + bufp->buffer; \
1741 if (fixup_alt_jump) { \
1743 (fixup_alt_jump - old_buffer) + \
1747 laststart = (laststart - old_buffer) + \
1750 if (pending_exact) { \
1752 (pending_exact - old_buffer) + \
1758 /* Since we have one byte reserved for the register number argument to
1759 {start,stop}_memory, the maximum number of groups we can report
1760 things about is what fits in that byte. */
1761 #define MAX_REGNUM 255
1763 /* But patterns can have more than `MAX_REGNUM' registers. We just
1764 ignore the excess. */
1765 typedef unsigned regnum_t;
1767 #define INIT_REG_TRANSLATE_SIZE 5
1769 /* Macros for the compile stack. */
1771 /* Since offsets can go either forwards or backwards, this type needs to
1772 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1773 typedef int pattern_offset_t;
1776 pattern_offset_t begalt_offset;
1777 pattern_offset_t fixup_alt_jump;
1778 pattern_offset_t inner_group_offset;
1779 pattern_offset_t laststart_offset;
1781 } compile_stack_elt_t;
1784 compile_stack_elt_t *stack;
1786 unsigned avail; /* Offset of next open position. */
1787 } compile_stack_type;
1789 #define INIT_COMPILE_STACK_SIZE 32
1791 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1792 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1794 /* The next available element. */
1795 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1797 /* Set the bit for character C in a bit vector. */
1798 #define SET_LIST_BIT(c) \
1799 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1800 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1804 /* Set the "bit" for character C in a range table. */
1805 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1807 /* Set the "bit" for character c in the appropriate table. */
1808 #define SET_EITHER_BIT(c) \
1810 if (has_extended_chars) \
1811 SET_RANGETAB_BIT (c); \
1816 #else /* not MULE */
1818 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1822 /* Get the next unsigned number in the uncompiled pattern. */
1823 #define GET_UNSIGNED_NUMBER(num) \
1827 while (ISDIGIT (c)) \
1831 num = num * 10 + c - '0'; \
1839 #define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
1842 static void store_op1(re_opcode_t op, unsigned char *loc, int arg);
1843 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1844 static void insert_op1(re_opcode_t op, unsigned char *loc, int arg,
1845 unsigned char *end);
1846 static void insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1847 unsigned char *end);
1848 static re_bool at_begline_loc_p(re_char*, re_char*, reg_syntax_t);
1849 static re_bool at_endline_loc_p(re_char*, re_char*, int syntax);
1850 static re_bool group_in_compile_stack(compile_stack_type compile_stack,
1852 static reg_errcode_t compile_range(re_char ** p_ptr, re_char * pend,
1853 RE_TRANSLATE_TYPE translate,
1854 reg_syntax_t syntax, unsigned char *b);
1856 static reg_errcode_t compile_extended_range(re_char ** p_ptr,
1858 RE_TRANSLATE_TYPE translate,
1859 reg_syntax_t syntax,
1862 static re_bool group_match_null_string_p(unsigned char **p,
1864 register_info_type * reg_info);
1865 static re_bool alt_match_null_string_p(unsigned char *p, unsigned char *end,
1866 register_info_type * reg_info);
1867 static re_bool common_op_match_null_string_p(unsigned char **p,
1869 register_info_type * reg_info);
1870 static int bcmp_translate(const unsigned char *s1, const unsigned char *s2,
1871 REGISTER int len, RE_TRANSLATE_TYPE translate);
1872 static int re_match_2_internal(struct re_pattern_buffer *bufp,
1873 re_char * string1, int size1,
1874 re_char * string2, int size2, int pos,
1875 struct re_registers *regs, int stop);
1877 #ifndef MATCH_MAY_ALLOCATE
1879 /* If we cannot allocate large objects within re_match_2_internal,
1880 we make the fail stack and register vectors global.
1881 The fail stack, we grow to the maximum size when a regexp
1883 The register vectors, we adjust in size each time we
1884 compile a regexp, according to the number of registers it needs. */
1886 static fail_stack_type fail_stack;
1888 /* Size with which the following vectors are currently allocated.
1889 That is so we can make them bigger as needed,
1890 but never make them smaller. */
1891 static int regs_allocated_size;
1893 static re_char **regstart, **regend;
1894 static re_char **old_regstart, **old_regend;
1895 static re_char **best_regstart, **best_regend;
1896 static register_info_type *reg_info;
1897 static re_char **reg_dummy;
1898 static register_info_type *reg_info_dummy;
1900 /* Make the register vectors big enough for NUM_REGS registers,
1901 but don't make them smaller. */
1903 static void regex_grow_registers(int num_regs)
1905 if (num_regs > regs_allocated_size) {
1906 RETALLOC_IF(regstart, num_regs, re_char *);
1907 RETALLOC_IF(regend, num_regs, re_char *);
1908 RETALLOC_IF(old_regstart, num_regs, re_char *);
1909 RETALLOC_IF(old_regend, num_regs, re_char *);
1910 RETALLOC_IF(best_regstart, num_regs, re_char *);
1911 RETALLOC_IF(best_regend, num_regs, re_char *);
1912 RETALLOC_IF(reg_info, num_regs, register_info_type);
1913 RETALLOC_IF(reg_dummy, num_regs, re_char *);
1914 RETALLOC_IF(reg_info_dummy, num_regs, register_info_type);
1916 regs_allocated_size = num_regs;
1920 #endif /* not MATCH_MAY_ALLOCATE */
1922 /* auxiliary stuff, FSF theft */
1923 /* Character classes. */
1926 RECC_ALNUM, RECC_ALPHA, RECC_WORD,
1927 RECC_GRAPH, RECC_PRINT,
1928 RECC_LOWER, RECC_UPPER,
1929 RECC_PUNCT, RECC_CNTRL,
1930 RECC_DIGIT, RECC_XDIGIT,
1931 RECC_BLANK, RECC_SPACE,
1932 RECC_MULTIBYTE, RECC_NONASCII,
1933 RECC_ASCII, RECC_UNIBYTE
1936 /* Map a string to the char class it names (if any). */
1937 static inline re_wctype_t
1938 re_wctype(re_char *str)
1940 const char *string = (const char*)str;
1942 if (STREQ (string, "alnum")) {
1944 } else if (STREQ (string, "alpha")) {
1946 } else if (STREQ (string, "digit")) {
1948 } else if (STREQ (string, "graph")) {
1950 } else if (STREQ (string, "space")) {
1952 } else if (STREQ (string, "upper")) {
1954 } else if (STREQ (string, "lower")) {
1956 } else if (STREQ (string, "word")) {
1958 } else if (STREQ (string, "print")) {
1960 } else if (STREQ (string, "punct")) {
1962 } else if (STREQ (string, "xdigit")) {
1964 } else if (STREQ (string, "cntrl")) {
1966 } else if (STREQ (string, "blank")) {
1968 } else if (STREQ (string, "ascii")) {
1970 } else if (STREQ (string, "nonascii")) {
1971 return RECC_NONASCII;
1972 } else if (STREQ (string, "unibyte")) {
1973 return RECC_UNIBYTE;
1974 } else if (STREQ (string, "multibyte")) {
1975 return RECC_MULTIBYTE;
1981 /* True if CH is in the char class CC. */
1983 re_iswctype(int ch, re_wctype_t cc)
1987 return ISALNUM (ch);
1989 return ISALPHA (ch);
1991 return ISBLANK (ch);
1993 return ISCNTRL (ch);
1995 return ISDIGIT (ch);
1997 return ISGRAPH (ch);
1999 return ISLOWER (ch);
2001 return ISPRINT (ch);
2003 return ISPUNCT (ch);
2005 return ISSPACE (ch);
2007 return ISUPPER (ch);
2009 return ISXDIGIT (ch);
2011 return IS_REAL_ASCII (ch);
2013 return !IS_REAL_ASCII (ch);
2017 return ISUNIBYTE((unsigned int)ch);
2018 case RECC_MULTIBYTE:
2019 return !ISUNIBYTE((unsigned int)ch);
2029 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2030 Returns one of error codes defined in `regex.h', or zero for success.
2032 Assumes the `allocated' (and perhaps `buffer') and `translate'
2033 fields are set in BUFP on entry.
2035 If it succeeds, results are put in BUFP (if it returns an error, the
2036 contents of BUFP are undefined):
2037 `buffer' is the compiled pattern;
2038 `syntax' is set to SYNTAX;
2039 `used' is set to the length of the compiled pattern;
2040 `fastmap_accurate' is zero;
2041 `re_ngroups' is the number of groups/subexpressions (including shy
2043 `re_nsub' is the number of non-shy groups in PATTERN;
2044 `not_bol' and `not_eol' are zero;
2046 The `fastmap' and `newline_anchor' fields are neither
2047 examined nor set. */
2049 static reg_errcode_t
2050 regex_compile(re_char * pattern, int size, reg_syntax_t syntax,
2051 struct re_pattern_buffer *bufp)
2053 /* We fetch characters from PATTERN here. We declare these as int
2054 (or possibly long) so that chars above 127 can be used as
2055 array indices. The macros that fetch a character from the pattern
2056 make sure to coerce to unsigned char before assigning, so we won't
2057 get bitten by negative numbers here. */
2058 /* XEmacs change: used to be unsigned char. */
2059 REGISTER EMACS_INT c, c1;
2061 /* A random temporary spot in PATTERN. */
2064 /* Points to the end of the buffer, where we should append. */
2065 REGISTER unsigned char *buf_end;
2067 /* Keeps track of unclosed groups. */
2068 compile_stack_type compile_stack;
2070 /* Points to the current (ending) position in the pattern. */
2071 re_char *p = pattern;
2072 re_char *pend = pattern + size;
2074 /* How to translate the characters in the pattern. */
2075 RE_TRANSLATE_TYPE translate = bufp->translate;
2077 /* Address of the count-byte of the most recently inserted `exactn'
2078 command. This makes it possible to tell if a new exact-match
2079 character can be added to that command or if the character requires
2080 a new `exactn' command. */
2081 unsigned char *pending_exact = 0;
2083 /* Address of start of the most recently finished expression.
2084 This tells, e.g., postfix * where to find the start of its
2085 operand. Reset at the beginning of groups and alternatives. */
2086 unsigned char *laststart = 0;
2088 /* Address of beginning of regexp, or inside of last group. */
2089 unsigned char *begalt;
2091 /* Place in the uncompiled pattern (i.e., the {) to
2092 which to go back if the interval is invalid. */
2093 re_char *beg_interval;
2095 /* Address of the place where a forward jump should go to the end of
2096 the containing expression. Each alternative of an `or' -- except the
2097 last -- ends with a forward jump of this sort. */
2098 unsigned char *fixup_alt_jump = 0;
2100 /* Counts open-groups as they are encountered. Remembered for the
2101 matching close-group on the compile stack, so the same register
2102 number is put in the stop_memory as the start_memory. */
2103 regnum_t regnum = 0;
2105 /* we need to close over compile_stack */
2106 # define free_stack(args...) xfree(compile_stack.stack)
2108 #ifdef REGEX_DEBUG_FLAG
2109 DEBUG_PRINT1("\nCompiling pattern: ");
2113 for (debug_count = 0; debug_count < size; debug_count++)
2114 putchar(pattern[debug_count]);
2119 /* Initialize the compile stack. */
2120 compile_stack.stack =
2121 TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2122 if (compile_stack.stack == NULL) {
2125 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2126 compile_stack.avail = 0;
2128 /* Initialize the pattern buffer. */
2129 bufp->syntax = syntax;
2130 bufp->fastmap_accurate = 0;
2131 bufp->not_bol = bufp->not_eol = 0;
2133 /* Set `used' to zero, so that if we return an error, the pattern
2134 printer (for debugging) will think there's no pattern. We reset it
2138 /* Always count groups, whether or not bufp->no_sub is set. */
2140 bufp->re_ngroups = 0;
2142 /* Allocate index translation array if needed. */
2143 if (bufp->external_to_internal_register == 0) {
2144 bufp->external_to_internal_register_size =
2145 INIT_REG_TRANSLATE_SIZE;
2146 RETALLOC(bufp->external_to_internal_register,
2147 bufp->external_to_internal_register_size, int);
2149 /* Initialize translations to impossible value to aid debugging. */
2153 bufp->external_to_internal_register[0] = 0;
2154 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2155 bufp->external_to_internal_register[i] =
2159 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2160 /* Initialize the syntax table. */
2164 if (bufp->allocated == 0) {
2166 /* If zero allocated, but buffer is non-null, try to
2167 realloc enough space. This loses if buffer's address
2168 is bogus, but that is the user's responsibility. */
2169 RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
2171 /* Caller did not allocate a buffer. Do it for them. */
2172 bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
2174 if (!bufp->buffer) {
2179 bufp->allocated = INIT_BUF_SIZE;
2182 begalt = buf_end = bufp->buffer;
2184 /* Loop through the uncompiled pattern until we're at the end. */
2190 if ( /* If at start of pattern, it's an operator. */
2192 /* If context independent, it's an operator. */
2193 || syntax & RE_CONTEXT_INDEP_ANCHORS
2194 /* Otherwise, depends on what's come before. */
2195 || at_begline_loc_p(pattern, p, syntax))
2203 if ( /* If at end of pattern, it's an
2206 /* If context independent, it's an
2208 || syntax & RE_CONTEXT_INDEP_ANCHORS
2209 /* Otherwise, depends on what's
2211 || at_endline_loc_p(p, pend, syntax))
2220 if ((syntax & RE_BK_PLUS_QM) ||
2221 (syntax & RE_LIMITED_OPS)) {
2227 re_bool zero_times_ok;
2228 re_bool many_times_ok;
2231 /* If there is no previous pattern... */
2233 if (syntax & RE_CONTEXT_INVALID_OPS) {
2236 } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) {
2241 /* true means zero/many matches are allowed. */
2242 zero_times_ok = c != '+';
2243 many_times_ok = c != '?';
2245 /* true means match shortest string possible. */
2248 /* If there is a sequence of repetition chars, collapse it
2249 down to just one (the right one). We can't combine
2250 interval operators with these because of, e.g., `a{2}*',
2251 which should only match an even number of `a's. */
2256 || (!(syntax & RE_BK_PLUS_QM)
2257 && (c == '+' || c == '?'))) ;
2259 else if (syntax & RE_BK_PLUS_QM
2267 if (!(c1 == '+' || c1 == '?')) {
2279 /* If we get here, we found another repeat
2281 if (!(syntax & RE_NO_MINIMAL_MATCHING)) {
2282 /* "*?" and "+?" and "??" are okay (and
2283 mean match minimally), but other
2284 sequences (such as "*??" and "+++")
2285 are rejected (reserved for future
2287 if (minimal || c != '?') {
2293 zero_times_ok |= c != '+';
2294 many_times_ok |= c != '?';
2298 /* Star, etc. applied to an empty pattern is equivalent
2299 to an empty pattern. */
2304 /* Now we know whether zero matches is allowed
2305 and whether two or more matches is allowed
2306 and whether we want minimal or maximal matching. */
2308 if (!many_times_ok) {
2310 0: /on_failure_jump to 6
2315 GET_BUFFER_SPACE(6);
2316 INSERT_JUMP(jump, laststart,
2319 INSERT_JUMP(on_failure_jump,
2323 } else if (zero_times_ok) {
2327 6: /on_failure_jump to 3
2330 GET_BUFFER_SPACE(6);
2331 INSERT_JUMP(jump, laststart,
2334 STORE_JUMP(on_failure_jump,
2341 3: /on_failure_jump to 0
2344 GET_BUFFER_SPACE(3);
2345 STORE_JUMP(on_failure_jump,
2346 buf_end, laststart);
2350 /* Are we optimizing this jump? */
2351 re_bool keep_string_p = false;
2353 if (many_times_ok) {
2354 /* More than one repetition is allowed,
2355 so put in at the end a backward
2356 relative jump from `buf_end' to
2357 before the next jump we're going to
2358 put in below (which jumps from
2359 laststart to after this jump).
2361 But if we are at the `*' in the exact
2362 sequence `.*\n', insert an
2363 unconditional jump backwards to the
2364 ., instead of the beginning of the
2365 loop. This way we only push a
2366 failure point once, instead of every
2367 time through the loop. */
2368 assert(p - 1 > pattern);
2370 /* Allocate the space for the jump. */
2371 GET_BUFFER_SPACE(3);
2373 /* We know we are not at the first
2374 character of the pattern, because
2375 laststart was nonzero. And we've
2376 already incremented `p', by the way,
2377 to be the character after the `*'.
2378 Do we have to do something analogous
2379 here for null bytes, because of
2381 if (*(p - 2) == '.' &&
2385 !(syntax & RE_DOT_NEWLINE)) {
2390 keep_string_p = true;
2392 /* Anything else. */
2398 /* We've added more stuff to the
2403 /* On failure, jump from laststart to buf_end +
2404 3, which will be the end of the buffer after
2405 this jump is inserted. */
2406 GET_BUFFER_SPACE(3);
2407 INSERT_JUMP(keep_string_p ?
2408 on_failure_keep_string_jump
2410 laststart, buf_end + 3);
2413 if (!zero_times_ok) {
2414 /* At least one repetition is required,
2415 so insert a `dummy_failure_jump'
2416 before the initial `on_failure_jump'
2417 instruction of the loop. This effects
2418 a skip over that instruction the
2419 first time we hit that loop. */
2420 GET_BUFFER_SPACE(3);
2421 INSERT_JUMP(dummy_failure_jump,
2432 laststart = buf_end;
2437 /* XEmacs change: this whole section */
2438 re_bool had_char_class = false;
2440 re_bool has_extended_chars = false;
2441 REGISTER Lisp_Object rtab = Qnil;
2449 /* Ensure that we have enough space to push a charset:
2450 the opcode, the length count, and the bitset; 34
2452 GET_BUFFER_SPACE(34);
2454 laststart = buf_end;
2456 /* We test `*p == '^' twice, instead of using an if
2457 statement, so we only need one BUF_PUSH. */
2458 BUF_PUSH(*p == '^' ? charset_not : charset);
2462 /* Remember the first position in the bracket
2466 /* Push the number of bytes in the bitmap. */
2467 BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2469 /* Clear the whole map. */
2470 memset(buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2472 /* charset_not matches newline according to a syntax
2474 if ((re_opcode_t) buf_end[-2] == charset_not
2475 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2479 start_over_with_extended:
2480 if (has_extended_chars) {
2481 /* There are extended chars here, which means we
2482 need to start over and shift to unified
2483 range-table format. */
2484 if (buf_end[-2] == charset)
2485 buf_end[-2] = charset_mule;
2487 buf_end[-2] = charset_mule_not;
2490 /* go back to the beginning of the charset,
2491 after a possible ^. */
2492 rtab = Vthe_lisp_rangetab;
2493 Fclear_range_table(rtab);
2495 /* charset_not matches newline according to a
2497 if ((re_opcode_t) buf_end[-1] ==
2500 RE_HAT_LISTS_NOT_NEWLINE))
2501 SET_EITHER_BIT('\n');
2505 /* Read in characters and ranges, setting map bits. */
2514 if (c >= 0x80 && !has_extended_chars) {
2515 has_extended_chars = 1;
2516 /* Frumble-bumble, we've found some
2517 extended chars. Need to start over,
2518 process everything using the general
2519 extended-char mechanism, and need to
2520 use charset_mule and charset_mule_not
2521 instead of charset and
2523 goto start_over_with_extended;
2526 /* \ might escape characters inside [...] and
2528 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) &&
2538 && !has_extended_chars) {
2539 has_extended_chars = 1;
2540 goto start_over_with_extended;
2547 /* Could be the end of the bracket
2548 expression. If it's not (i.e., when
2549 the bracket expression is `[]' so
2550 far), the ']' character bit gets set
2552 if (c == ']' && p != p1 + 1)
2555 /* Look ahead to see if it's a range
2556 when the last thing was a character
2558 if (had_char_class && c == '-'
2564 /* Look ahead to see if it's a range
2565 when the last thing was a character:
2566 if this is a hyphen not at the
2567 beginning or the end of a list, then
2568 it's the range operator. */
2570 && !(p - 2 >= pattern
2572 && !(p - 3 >= pattern
2579 if (*(const unsigned char *)p >= 0x80
2580 && !has_extended_chars) {
2581 has_extended_chars = 1;
2582 goto start_over_with_extended;
2584 if (has_extended_chars)
2585 ret = compile_extended_range
2586 (&p, pend, translate,
2591 (&p, pend, translate,
2593 if (ret != REG_NOERROR) {
2599 else if (p[0] == '-' && p[1] != ']') {
2600 /* This handles ranges made up of
2604 /* Move past the `-'. */
2608 if (*(const unsigned char*)p >= 0x80
2609 && !has_extended_chars) {
2610 has_extended_chars = 1;
2611 goto start_over_with_extended;
2613 if (has_extended_chars)
2614 ret = compile_extended_range(
2615 &p, pend, translate,
2619 ret = compile_range(
2620 &p, pend, translate,
2622 if (ret != REG_NOERROR) {
2628 /* See if we're at the beginning of a possible
2631 else if (syntax & RE_CHAR_CLASSES &&
2632 c == '[' && *p == ':') {
2633 /* Leave room for the null. */
2634 char str[CHAR_CLASS_MAX_LENGTH + 1];
2639 /* If pattern is `[[:'. */
2645 /* #### This code is unused.
2646 Correctness is not checked
2647 after TRT table change. */
2649 if (c == ':' || c == ']'
2652 CHAR_CLASS_MAX_LENGTH)
2654 str[c1++] = (char)c;
2658 /* If isn't a word bracketed by
2659 `[:' and `:]': undo the
2660 ending character, the
2661 letters, and leave the
2662 leading `:' and `[' (but set
2664 if (c == ':' && *p == ']') {
2670 if (wct == RECC_ERROR) {
2675 /* Throw away the ] at
2695 re_iswctype(ch, wct)) {
2699 if (re_iswctype(ch, wct)) {
2704 had_char_class = true;
2710 SET_EITHER_BIT('[');
2711 SET_EITHER_BIT(':');
2712 had_char_class = false;
2715 had_char_class = false;
2721 if (has_extended_chars) {
2722 /* We have a range table, not a bit vector. */
2724 unified_range_table_bytes_needed
2726 GET_BUFFER_SPACE(bytes_needed);
2727 unified_range_table_copy_data(rtab,
2730 unified_range_table_bytes_used
2735 /* Discard any (non)matching list bytes that are
2736 all 0 at the end of the map. Decrease the
2737 map-length byte too. */
2738 while ((int)buf_end[-1] > 0
2739 && buf_end[buf_end[-1] - 1] == 0)
2741 buf_end += buf_end[-1];
2746 if (syntax & RE_NO_BK_PARENS)
2752 if (syntax & RE_NO_BK_PARENS)
2758 if (syntax & RE_NEWLINE_ALT)
2764 if (syntax & RE_NO_BK_VBAR)
2770 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2771 goto handle_interval;
2781 /* Do not translate the character after the \, so that we can
2782 distinguish, e.g., \B from \b, even if we normally would
2783 translate, e.g., B to b. */
2788 if (syntax & RE_NO_BK_PARENS)
2789 goto normal_backslash;
2796 if (!(syntax & RE_NO_SHY_GROUPS)
2797 && p != pend && *p == '?') {
2818 /* Record the translation from capturing group index to
2819 register number, reallocating table as needed. */
2822 external_to_internal_register_size
2827 external_to_internal_register_size;
2829 external_to_internal_register_size
2832 external_to_internal_register,
2834 external_to_internal_register_size,
2840 external_to_internal_register_size;
2843 external_to_internal_register
2850 external_to_internal_register
2855 if (COMPILE_STACK_FULL) {
2856 RETALLOC(compile_stack.stack,
2859 compile_stack_elt_t);
2860 if (compile_stack.stack == NULL)
2863 compile_stack.size <<= 1;
2866 /* These are the values to restore when
2867 we hit end of this group. They are
2868 all relative offsets, so that if the
2869 whole pattern moves because of
2870 realloc, they will still be
2872 COMPILE_STACK_TOP.begalt_offset =
2873 begalt - bufp->buffer;
2874 COMPILE_STACK_TOP.fixup_alt_jump =
2879 COMPILE_STACK_TOP.laststart_offset =
2880 buf_end - bufp->buffer;
2881 COMPILE_STACK_TOP.regnum = r;
2883 /* We will eventually replace the 0 with
2884 the number of groups inner to this
2885 one. But do not push a start_memory
2886 for groups beyond the last one we can
2887 represent in the compiled pattern.
2888 #### bad bad bad. this will fail in
2889 lots of ways, if we ever have to
2890 backtrack for these groups.
2892 if (r <= MAX_REGNUM) {
2894 inner_group_offset =
2895 buf_end - bufp->buffer +
2897 BUF_PUSH_3(start_memory, r, 0);
2900 compile_stack.avail++;
2905 /* If we've reached MAX_REGNUM groups,
2906 then this open won't actually
2907 generate any code, so we'll have to
2908 clear pending_exact explicitly. */
2914 if (syntax & RE_NO_BK_PARENS)
2915 goto normal_backslash;
2917 if (COMPILE_STACK_EMPTY) {
2919 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2920 goto normal_backslash;
2928 if (fixup_alt_jump) {
2929 /* Push a dummy failure point at the end
2930 of the alternative for a possible
2931 future `pop_failure_jump' to pop.
2932 See comments at `push_dummy_failure'
2934 BUF_PUSH(push_dummy_failure);
2936 /* We allocated space for this jump when
2937 we assigned to `fixup_alt_jump', in
2938 the `handle_alt' case below. */
2939 STORE_JUMP(jump_past_alt,
2940 fixup_alt_jump, buf_end - 1);
2943 /* See similar code for backslashed left paren
2945 if (COMPILE_STACK_EMPTY) {
2947 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2955 /* Since we just checked for an empty stack
2956 above, this ``can't happen''. */
2957 assert(compile_stack.avail != 0);
2959 /* We don't just want to restore into
2960 `regnum', because later groups should
2961 continue to be numbered higher, as in
2962 `(ab)c(de)' -- the second group is
2964 regnum_t this_group_regnum;
2966 compile_stack.avail--;
2969 COMPILE_STACK_TOP.begalt_offset;
2982 COMPILE_STACK_TOP.regnum;
2983 /* If we've reached MAX_REGNUM groups,
2984 then this open won't actually
2985 generate any code, so we'll have to
2986 clear pending_exact explicitly. */
2989 /* We're at the end of the group, so now
2990 we know how many groups were inside
2992 if (this_group_regnum <= MAX_REGNUM) {
2993 unsigned char *inner_group_loc
3002 BUF_PUSH_3(stop_memory,
3010 case '|': /* `\|'. */
3011 if (syntax & RE_LIMITED_OPS
3012 || syntax & RE_NO_BK_VBAR) {
3013 goto normal_backslash;
3016 if (syntax & RE_LIMITED_OPS) {
3020 /* Insert before the previous alternative a jump
3021 which jumps to this alternative if the former
3023 GET_BUFFER_SPACE(3);
3024 INSERT_JUMP(on_failure_jump, begalt,
3029 /* The alternative before this one has a jump after it
3030 which gets executed if it gets matched. Adjust that
3031 jump so it will jump to this alternative's analogous
3032 jump (put in below, which in turn will jump to the next
3033 (if any) alternative's such jump, etc.). The last such
3034 jump jumps to the correct final destination. A picture:
3040 If we are at `b', then fixup_alt_jump right now points to a
3041 three-byte space after `a'. We'll put in the jump, set
3042 fixup_alt_jump to right after `b', and leave behind three
3043 bytes which we'll fill in when we get to after `c'. */
3046 STORE_JUMP(jump_past_alt,
3047 fixup_alt_jump, buf_end);
3049 /* Mark and leave space for a jump after this alternative,
3050 to be filled in later either by next alternative or
3051 when know we're at the end of a series of alternatives. */
3052 fixup_alt_jump = buf_end;
3053 GET_BUFFER_SPACE(3);
3061 /* If \{ is a literal. */
3062 if (!(syntax & RE_INTERVALS)
3063 /* If we're at `\{' and it's not the open-interval
3065 || ((syntax & RE_INTERVALS)
3066 && (syntax & RE_NO_BK_BRACES))
3067 || (p - 2 == pattern && p == pend))
3068 goto normal_backslash;
3072 /* If got here, then the syntax allows
3075 /* At least (most) this many matches
3077 int lower_bound = -1, upper_bound = -1;
3079 beg_interval = p - 1;
3082 if (syntax & RE_NO_BK_BRACES) {
3083 goto unfetch_interval;
3090 GET_UNSIGNED_NUMBER(lower_bound);
3093 GET_UNSIGNED_NUMBER(
3095 if (upper_bound < 0) {
3100 /* Interval such as `{1}' =>
3101 match exactly once. */
3102 upper_bound = lower_bound;
3106 || upper_bound > RE_DUP_MAX
3107 || lower_bound > upper_bound) {
3108 if (syntax & RE_NO_BK_BRACES) {
3109 goto unfetch_interval;
3116 if (!(syntax & RE_NO_BK_BRACES)) {
3125 if (syntax & RE_NO_BK_BRACES) {
3126 goto unfetch_interval;
3133 /* We just parsed a valid interval. */
3135 /* If it's invalid to have no preceding
3139 RE_CONTEXT_INVALID_OPS) {
3144 RE_CONTEXT_INDEP_OPS) {
3145 laststart = buf_end;
3147 goto unfetch_interval;
3151 /* If the upper bound is zero, don't
3152 want to succeed at all; jump from
3153 `laststart' to `b + 3', which will be
3154 the end of the buffer after we insert
3156 if (upper_bound == 0) {
3157 GET_BUFFER_SPACE(3);
3158 INSERT_JUMP(jump, laststart,
3163 /* Otherwise, we have a nontrivial interval. When
3164 we're all done, the pattern will look like:
3165 set_number_at <jump count> <upper bound>
3166 set_number_at <succeed_n count> <lower bound>
3167 succeed_n <after jump addr> <succeed_n count>
3169 jump_n <succeed_n addr> <jump count>
3170 (The upper bound and `jump_n' are omitted if
3171 `upper_bound' is 1, though.) */
3172 else { /* If the upper bound is > 1, we need to insert
3173 more at the end of the loop. */
3174 Memory_count nbytes =
3175 10 + (upper_bound > 1) * 10;
3177 GET_BUFFER_SPACE(nbytes);
3179 /* Initialize lower bound of the `succeed_n', even
3180 though it will be set during matching by its
3181 attendant `set_number_at' (inserted next),
3182 because `re_compile_fastmap' needs to know.
3183 Jump to the `jump_n' we might insert below. */
3184 INSERT_JUMP2(succeed_n,
3192 /* Code to initialize the lower bound. Insert
3193 before the `succeed_n'. The `5' is the last two
3194 bytes of this `set_number_at', plus 3 bytes of
3195 the following `succeed_n'. */
3196 insert_op2(set_number_at,
3202 if (upper_bound > 1) { /* More than one repetition is allowed, so
3203 append a backward jump to the `succeed_n'
3204 that starts this interval.
3206 When we've reached this during matching,
3207 we'll have matched the interval once, so
3208 jump back only `upper_bound - 1' times. */
3217 /* The location we want to set is the second
3218 parameter of the `jump_n'; that is `b-2' as
3219 an absolute address. `laststart' will be
3220 the `set_number_at' we're about to insert;
3221 `laststart+3' the number to set, the source
3222 for the relative address. But we are
3223 inserting into the middle of the pattern --
3224 so everything is getting moved up by 5.
3225 Conclusion: (b - 2) - (laststart + 3) + 5,
3226 i.e., b - laststart.
3228 We insert this at the beginning of the loop
3229 so that if we fail during matching, we'll
3230 reinitialize the bounds. */
3242 beg_interval = NULL;
3247 /* If an invalid interval, match the characters as literals. */
3248 assert(beg_interval);
3250 beg_interval = NULL;
3252 /* normal_char and normal_backslash need `c'. */
3255 if (!(syntax & RE_NO_BK_BRACES)) {
3256 if (p > pattern && p[-1] == '\\')
3257 goto normal_backslash;
3262 /* There is no way to specify the before_dot and after_dot
3263 operators. rms says this is ok. --karl */
3269 laststart = buf_end;
3271 /* XEmacs addition */
3272 if (c >= 0x80 || syntax_spec_code[c] == 0377) {
3276 BUF_PUSH_2(syntaxspec, syntax_spec_code[c]);
3280 laststart = buf_end;
3282 /* XEmacs addition */
3283 if (c >= 0x80 || syntax_spec_code[c] == 0377) {
3287 BUF_PUSH_2(notsyntaxspec, syntax_spec_code[c]);
3291 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
3293 laststart = buf_end;
3295 if (c < 32 || c > 127) {
3297 return REG_ECATEGORY;
3299 BUF_PUSH_2(categoryspec, c);
3303 laststart = buf_end;
3305 if (c < 32 || c > 127) {
3307 return REG_ECATEGORY;
3309 BUF_PUSH_2(notcategoryspec, c);
3311 /* end of category patch */
3316 laststart = buf_end;
3321 laststart = buf_end;
3322 BUF_PUSH(notwordchar);
3334 BUF_PUSH(wordbound);
3338 BUF_PUSH(notwordbound);
3360 if (syntax & RE_NO_BK_REFS) {
3363 /* External register indexing. */
3366 if (reg > bufp->re_nsub) {
3371 /* Convert external to internal as soon
3373 reg = bufp->external_to_internal_register[reg];
3375 /* Can't back reference to a
3376 subexpression if inside it. */
3377 if (group_in_compile_stack(
3378 compile_stack, reg)) {
3381 laststart = buf_end;
3382 BUF_PUSH_2(duplicate, reg);
3388 if (syntax & RE_BK_PLUS_QM) {
3391 goto normal_backslash;
3395 /* You might think it would be useful for \ to
3396 mean not to translate; but if we don't
3397 translate it, it will never match
3404 /* Expects the character in `c'. */
3405 /* `p' points to the location after where `c' came
3409 /* XEmacs: modifications here for Mule. */
3410 /* `q' points to the beginning of the next char. */
3413 /* If no exactn currently being built. */
3415 /* If last exactn not at current position. */
3416 || pending_exact + *pending_exact + 1 !=
3418 /* We have only one byte following the exactn for
3420 || ((unsigned int)(*pending_exact + (q - p)) >=
3421 ((unsigned int)(1 << BYTEWIDTH) - 1))
3423 /* If followed by a repetition operator. */
3424 || *q == '*' || *q == '^'
3425 || ((syntax & RE_BK_PLUS_QM)
3426 ? *q == '\\' && (q[1] == '+'
3428 : (*q == '+' || *q == '?'))
3429 || ((syntax & RE_INTERVALS)
3430 && ((syntax & RE_NO_BK_BRACES)
3432 : (q[0] == '\\' && q[1] == '{')))) {
3433 /* Start building a new exactn. */
3435 laststart = buf_end;
3437 BUF_PUSH_2(exactn, 0);
3438 pending_exact = buf_end - 1;
3446 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3450 set_charptr_emchar(tmp_buf, c);
3452 for (i = 0; i < bt_count; i++) {
3453 BUF_PUSH(tmp_buf[i]);
3461 } /* while p != pend */
3463 /* Through the pattern now. */
3465 if (fixup_alt_jump) {
3466 STORE_JUMP(jump_past_alt, fixup_alt_jump, buf_end);
3468 if (!COMPILE_STACK_EMPTY) {
3472 /* If we don't want backtracking, force success
3473 the first time we reach the end of the compiled pattern. */
3474 if (syntax & RE_NO_POSIX_BACKTRACKING) {
3477 xfree(compile_stack.stack);
3479 /* We have succeeded; set the length of the buffer. */
3480 bufp->used = buf_end - bufp->buffer;
3482 #ifdef REGEX_DEBUG_FLAG
3484 DEBUG_PRINT1("\nCompiled pattern: \n");
3485 print_compiled_pattern(bufp);
3489 #ifndef MATCH_MAY_ALLOCATE
3490 /* Initialize the failure stack to the largest possible stack. This
3491 isn't necessary unless we're trying to avoid calling alloca in
3492 the search and match routines. */
3494 int num_regs = bufp->re_ngroups + 1;
3496 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3497 is strictly greater than re_max_failures, the largest possible stack
3498 is 2 * re_max_failures failure points. */
3499 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) {
3501 (2 * re_max_failures * MAX_FAILURE_ITEMS);
3503 if (!fail_stack.stack) {
3505 (fail_stack_elt_t *)
3508 sizeof(fail_stack_elt_t));
3511 (fail_stack_elt_t *)
3512 xrealloc(fail_stack.stack,
3514 sizeof(fail_stack_elt_t)));
3518 regex_grow_registers(num_regs);
3520 #endif /* not MATCH_MAY_ALLOCATE */
3526 /* Subroutines for `regex_compile'. */
3528 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3530 static void store_op1(re_opcode_t op, unsigned char *loc, int arg)
3532 *loc = (unsigned char)op;
3533 STORE_NUMBER(loc + 1, arg);
3536 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3538 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3540 *loc = (unsigned char)op;
3541 STORE_NUMBER(loc + 1, arg1);
3542 STORE_NUMBER(loc + 3, arg2);
3545 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3546 for OP followed by two-byte integer parameter ARG. */
3549 insert_op1(re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3551 REGISTER unsigned char *pfrom = end;
3552 REGISTER unsigned char *pto = end + 3;
3554 while (pfrom != loc)
3557 store_op1(op, loc, arg);
3560 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3563 insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3566 REGISTER unsigned char *pfrom = end;
3567 REGISTER unsigned char *pto = end + 5;
3569 while (pfrom != loc)
3572 store_op2(op, loc, arg1, arg2);
3575 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3576 after an alternative or a begin-subexpression. We assume there is at
3577 least one character before the ^. */
3580 at_begline_loc_p(re_char *pattern, re_char *p, reg_syntax_t syntax)
3582 re_char *prev = p - 2;
3583 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3586 /* After a subexpression? */
3587 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3588 /* After an alternative? */
3590 && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3593 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3594 at least one character after the $, i.e., `P < PEND'. */
3597 at_endline_loc_p(re_char *p, re_char *pend, int syntax)
3600 re_bool next_backslash = *next == '\\';
3601 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3604 /* Before a subexpression? */
3605 (syntax & RE_NO_BK_PARENS ? *next == ')'
3606 : next_backslash && next_next && *next_next == ')')
3607 /* Before an alternative? */
3608 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3609 : next_backslash && next_next && *next_next == '|');
3612 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3613 false if it's not. */
3616 group_in_compile_stack(compile_stack_type compile_stack, regnum_t regnum)
3620 for (this_element = compile_stack.avail - 1;
3621 this_element >= 0; this_element--)
3622 if (compile_stack.stack[this_element].regnum == regnum) {
3628 /* Read the ending character of a range (in a bracket expression) from the
3629 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3630 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3631 Then we set the translation of all bits between the starting and
3632 ending characters (inclusive) in the compiled pattern B.
3634 Return an error code.
3636 We use these short variable names so we can use the same macros as
3637 `regex_compile' itself. */
3639 static reg_errcode_t
3640 compile_range(re_char ** p_ptr, re_char * pend, RE_TRANSLATE_TYPE translate,
3641 reg_syntax_t syntax, unsigned char *buf_end)
3643 Element_count this_char;
3645 re_char *p = *p_ptr;
3646 int range_start, range_end;
3651 /* Even though the pattern is a signed `char *', we need to fetch
3652 with unsigned char *'s; if the high bit of the pattern character
3653 is set, the range endpoints will be negative if we fetch using a
3656 We also want to fetch the endpoints without translating them; the
3657 appropriate translation is done in the bit-setting loop below. */
3658 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3659 range_start = ((const unsigned char *)p)[-2];
3660 range_end = ((const unsigned char *)p)[0];
3662 /* Have to increment the pointer into the pattern string, so the
3663 caller isn't still at the ending character. */
3666 /* If the start is after the end, the range is empty. */
3667 if (range_start > range_end)
3668 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3670 /* Here we see why `this_char' has to be larger than an `unsigned
3671 char' -- the range is inclusive, so if `range_end' == 0xff
3672 (assuming 8-bit characters), we would otherwise go into an infinite
3673 loop, since all characters <= 0xff. */
3674 for (this_char = range_start; this_char <= range_end; this_char++) {
3675 SET_LIST_BIT(TRANSLATE(this_char));
3683 static reg_errcode_t
3684 compile_extended_range(re_char ** p_ptr, re_char * pend,
3685 RE_TRANSLATE_TYPE translate,
3686 reg_syntax_t syntax, Lisp_Object rtab)
3688 Emchar this_char, range_start, range_end;
3694 p = (const Bufbyte *)*p_ptr;
3695 range_end = charptr_emchar(p);
3696 p--; /* back to '-' */
3697 DEC_CHARPTR(p); /* back to start of range */
3698 /* We also want to fetch the endpoints without translating them; the
3699 appropriate translation is done in the bit-setting loop below. */
3700 range_start = charptr_emchar(p);
3701 INC_CHARPTR(*p_ptr);
3703 /* If the start is after the end, the range is empty. */
3704 if (range_start > range_end)
3705 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3707 /* Can't have ranges spanning different charsets, except maybe for
3708 ranges entirely within the first 256 chars. */
3710 if ((range_start >= 0x100 || range_end >= 0x100)
3711 && CHAR_LEADING_BYTE(range_start) != CHAR_LEADING_BYTE(range_end))
3712 return REG_ERANGESPAN;
3714 /* As advertised, translations only work over the 0 - 0x7F range.
3715 Making this kind of stuff work generally is much harder.
3716 Iterating over the whole range like this would be way efficient
3717 if the range encompasses 10,000 chars or something. You'd have
3718 to do something like this:
3722 map over translation table in [range_start, range_end] of
3723 (put the mapped range in a;
3724 put the translation in b)
3725 invert the range in a and truncate to [range_start, range_end]
3726 compute the union of a, b
3727 union the result into rtab
3729 for (this_char = range_start;
3730 this_char <= range_end && this_char < 0x80; this_char++) {
3731 SET_RANGETAB_BIT(TRANSLATE(this_char));
3734 if (this_char <= range_end)
3735 put_range_table(rtab, this_char, range_end, Qt);
3742 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3743 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3744 characters can start a string that matches the pattern. This fastmap
3745 is used by re_search to skip quickly over impossible starting points.
3747 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3748 area as BUFP->fastmap.
3750 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3753 Returns 0 if we succeed, -2 if an internal error. */
3755 int re_compile_fastmap(struct re_pattern_buffer *bufp)
3758 #ifdef MATCH_MAY_ALLOCATE
3759 fail_stack_type fail_stack;
3761 DECLARE_DESTINATION;
3762 /* We don't push any register information onto the failure stack. */
3764 REGISTER char *fastmap = bufp->fastmap;
3765 unsigned char *pattern = bufp->buffer;
3766 unsigned long size = bufp->used;
3767 /* actually p supposed to carry the const qualifier, however, some
3768 silly mule code below CHANGES p and hence we cant go with the const
3769 qualifier here, leaving an `unfixable' warning on the way */
3771 unsigned char *p = pattern;
3773 re_char *p = pattern;
3775 REGISTER unsigned char *pend = pattern + size;
3778 /* This holds the pointer to the failure stack, when
3779 it is allocated relocatably. */
3780 fail_stack_elt_t *failure_stack_ptr;
3783 /* Assume that each path through the pattern can be null until
3784 proven otherwise. We set this false at the bottom of switch
3785 statement, to which we get only if a particular path doesn't
3786 match the empty string. */
3787 re_bool path_can_be_null = true;
3789 /* We aren't doing a `succeed_n' to begin with. */
3790 re_bool succeed_n_p = false;
3792 assert(fastmap != NULL && p != NULL);
3795 memset(fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3796 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3797 bufp->can_be_null = 0;
3800 if (p == pend || *p == succeed) {
3801 /* We have reached the (effective) end of pattern. */
3802 if (!FAIL_STACK_EMPTY()) {
3803 bufp->can_be_null |= path_can_be_null;
3805 /* Reset for next path. */
3806 path_can_be_null = true;
3808 /* fuck, p isnt const thanks to that
3809 * unified range table function below */
3811 p = (unsigned char*)fail_stack.
3812 stack[--fail_stack.avail].pointer;
3814 p = fail_stack.stack[--fail_stack.avail]
3824 /* We should never be about to go beyond the end of the pattern. */
3827 switch (SWITCH_ENUM_CAST((re_opcode_t) * p++)) {
3829 /* I guess the idea here is to simply not bother with a
3830 fastmap if a backreference is used, since it's too
3831 hard to figure out the fastmap for the corresponding
3832 group. Setting `can_be_null' stops `re_search_2'
3833 from using the fastmap, so that is all we do. */
3835 bufp->can_be_null = 1;
3838 /* Following are the cases which match a character.
3839 These end with `break'. */
3846 /* XEmacs: Under Mule, these bit vectors will
3847 only contain values for characters below 0x80. */
3848 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3849 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3854 /* Chars beyond end of map must be allowed. */
3856 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3858 /* And all extended characters must be allowed, too. */
3859 for (j = 0x80; j < 0xA0; j++)
3861 #else /* not MULE */
3862 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3866 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3868 (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3873 case charset_mule: {
3877 nentries = unified_range_table_nentries(p);
3878 for (i = 0; i < nentries; i++) {
3879 EMACS_INT first, last;
3880 Lisp_Object dummy_val;
3882 Bufbyte strr[MAX_EMCHAR_LEN];
3884 unified_range_table_get_range(p, i,
3889 jj <= last && jj < 0x80; jj++)
3891 /* Ranges below 0x100 can span charsets, but there
3892 are only two (Control-1 and Latin-1), and
3893 either first or last has to be in them. */
3894 set_charptr_emchar(strr, first);
3897 set_charptr_emchar(strr, last);
3904 case charset_mule_not: {
3908 nentries = unified_range_table_nentries(p);
3909 for (i = 0; i < nentries; i++) {
3910 EMACS_INT first, last;
3911 Lisp_Object dummy_val;
3913 int smallest_prev = 0;
3915 unified_range_table_get_range(p, i,
3919 for (jj = smallest_prev;
3920 jj < first && jj < 0x80; jj++)
3922 smallest_prev = last + 1;
3923 if (smallest_prev >= 0x80)
3926 /* Calculating which leading bytes are actually allowed
3927 here is rather difficult, so we just punt and allow
3929 for (i = 0x80; i < 0xA0; i++)
3940 for (j = 0; j < (1 << BYTEWIDTH); j++)
3943 (regex_emacs_buffer->mirror_syntax_table),
3952 goto matchnotsyntax;
3954 for (j = 0; j < (1 << BYTEWIDTH); j++)
3957 (regex_emacs_buffer->mirror_syntax_table),
3964 int fastmap_newline = fastmap['\n'];
3966 /* `.' matches anything ... */
3968 /* "anything" only includes bytes that can be the
3969 first byte of a character. */
3970 for (j = 0; j < 0xA0; j++)
3973 for (j = 0; j < (1 << BYTEWIDTH); j++)
3977 /* ... except perhaps newline. */
3978 if (!(bufp->syntax & RE_DOT_NEWLINE))
3979 fastmap['\n'] = fastmap_newline;
3981 /* Return if we have already set `can_be_null'; if we
3982 have, then the fastmap is irrelevant. Something's
3984 else if (bufp->can_be_null)
3987 /* Otherwise, have to check alternative paths. */
3998 /* This match depends on text properties. These end with
3999 aborting optimizations. */
4000 bufp->can_be_null = 1;
4006 for (j = 0; j < 0x80; j++)
4009 (regex_emacs_buffer->mirror_syntax_table),
4010 j) == (enum syntaxcode)k)
4012 for (j = 0x80; j < 0xA0; j++) {
4013 if (LEADING_BYTE_PREFIX_P(j))
4014 /* too complicated to calculate this right */
4020 cset = CHARSET_BY_LEADING_BYTE(j);
4021 if (CHARSETP(cset)) {
4023 (regex_emacs_buffer, cset,
4025 == Sword || multi_p)
4030 #else /* not MULE */
4031 for (j = 0; j < (1 << BYTEWIDTH); j++)
4034 (regex_emacs_buffer->mirror_syntax_table),
4035 j) == (enum syntaxcode)k)
4042 for (j = 0; j < 0x80; j++)
4045 (regex_emacs_buffer->mirror_syntax_table),
4046 j) != (enum syntaxcode)k)
4048 for (j = 0x80; j < 0xA0; j++) {
4049 if (LEADING_BYTE_PREFIX_P(j))
4050 /* too complicated to calculate this right */
4056 cset = CHARSET_BY_LEADING_BYTE(j);
4057 if (CHARSETP(cset)) {
4059 (regex_emacs_buffer, cset,
4061 != Sword || multi_p)
4066 #else /* not MULE */
4067 for (j = 0; j < (1 << BYTEWIDTH); j++)
4070 (regex_emacs_buffer->mirror_syntax_table),
4071 j) != (enum syntaxcode)k)
4078 /* 97/2/17 jhod category patch */
4080 case notcategoryspec:
4081 bufp->can_be_null = 1;
4083 /* end if category patch */
4086 /* All cases after this match the empty string. These
4087 end with `continue'. */
4106 case push_dummy_failure:
4110 case pop_failure_jump:
4111 case maybe_pop_jump:
4114 case dummy_failure_jump:
4115 EXTRACT_NUMBER_AND_INCR(j, p);
4120 /* Jump backward implies we just went through the body
4121 of a loop and matched nothing. Opcode jumped to
4122 should be `on_failure_jump' or `succeed_n'. Just
4123 treat it like an ordinary jump. For a * loop, it has
4124 pushed its failure point already; if so, discard that
4126 if ((re_opcode_t) * p != on_failure_jump
4127 && (re_opcode_t) * p != succeed_n)
4131 EXTRACT_NUMBER_AND_INCR(j, p);
4134 /* If what's on the stack is where we are now, pop
4136 if (!FAIL_STACK_EMPTY()
4137 && fail_stack.stack[fail_stack.avail - 1].pointer ==
4143 case on_failure_jump:
4144 case on_failure_keep_string_jump:
4145 handle_on_failure_jump:
4146 EXTRACT_NUMBER_AND_INCR(j, p);
4148 /* For some patterns, e.g., `(a?)?', `p+j' here points
4149 to the end of the pattern. We don't want to push
4150 such a point, since when we restore it above,
4151 entering the switch will increment `p' past the end
4152 of the pattern. We don't need to push such a point
4153 since we obviously won't find any more fastmap
4154 entries beyond `pend'. Such a pattern can match the
4155 null string, though. */
4157 if (!PUSH_PATTERN_OP(p + j, fail_stack)) {
4162 bufp->can_be_null = 1;
4165 EXTRACT_NUMBER_AND_INCR(k, p); /* Skip the n. */
4166 succeed_n_p = false;
4172 /* Get to the number of times to succeed. */
4175 /* Increment p past the n for when k != 0. */
4176 EXTRACT_NUMBER_AND_INCR(k, p);
4179 succeed_n_p = true; /* Spaghetti code alert. */
4180 goto handle_on_failure_jump;
4195 abort(); /* We have listed all the cases. */
4198 /* Getting here means we have found the possible starting
4199 characters for one path of the pattern -- and that the empty
4200 string does not match. We need not follow this path further.
4201 Instead, look at the next alternative (remembered on the
4202 stack), or quit if no more. The test at the top of the loop
4203 does these things. */
4204 path_can_be_null = false;
4208 /* Set `can_be_null' for the last path (also the first path, if the
4209 pattern is empty). */
4210 bufp->can_be_null |= path_can_be_null;
4215 } /* re_compile_fastmap */
4217 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4218 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4219 this memory for recording register information. STARTS and ENDS
4220 must be allocated using the malloc library routine, and must each
4221 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4223 If NUM_REGS == 0, then subsequent matches should allocate their own
4226 Unless this function is called, the first search or match using
4227 PATTERN_BUFFER will allocate its own register data, without
4228 freeing the old data. */
4231 re_set_registers(struct re_pattern_buffer *bufp, struct re_registers *regs,
4232 unsigned num_regs, regoff_t * starts, regoff_t * ends)
4235 bufp->regs_allocated = REGS_REALLOCATE;
4236 regs->num_regs = num_regs;
4237 regs->start = starts;
4240 bufp->regs_allocated = REGS_UNALLOCATED;
4242 regs->start = regs->end = (regoff_t *) 0;
4246 /* Searching routines. */
4248 /* Like re_search_2, below, but only one string is specified, and
4249 doesn't let you say where to stop matching. */
4252 re_search(struct re_pattern_buffer *bufp, const char *string, int size,
4253 int startpos, int range, struct re_registers *regs)
4255 return re_search_2(bufp, NULL, 0, string, size, startpos, range,
4260 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
4261 # define bytecount_to_charcount(ptr, len) (len)
4262 # define charcount_to_bytecount(ptr, len) (len)
4263 typedef int Charcount;
4266 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4267 virtual concatenation of STRING1 and STRING2, starting first at index
4268 STARTPOS, then at STARTPOS + 1, and so on.
4270 With MULE, STARTPOS is a byte position, not a char position. And the
4271 search will increment STARTPOS by the width of the current leading
4274 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4276 RANGE is how far to scan while trying to match. RANGE = 0 means try
4277 only at STARTPOS; in general, the last start tried is STARTPOS +
4280 With MULE, RANGE is a byte position, not a char position. The last
4281 start tried is the character starting <= STARTPOS + RANGE.
4283 In REGS, return the indices of the virtual concatenation of STRING1
4284 and STRING2 that matched the entire BUFP->buffer and its contained
4287 Do not consider matching one past the index STOP in the virtual
4288 concatenation of STRING1 and STRING2.
4290 We return either the position in the strings at which the match was
4291 found, -1 if no match, or -2 if error (such as failure
4295 re_search_2(struct re_pattern_buffer *bufp, const char *str1,
4296 int size1, const char *str2, int size2, int startpos,
4297 int range, struct re_registers *regs, int stop)
4300 re_char *string1 = (re_char *) str1;
4301 re_char *string2 = (re_char *) str2;
4302 REGISTER char *fastmap = bufp->fastmap;
4303 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
4304 int total_size = size1 + size2;
4305 int endpos = startpos + range;
4306 #ifdef REGEX_BEGLINE_CHECK
4307 int anchored_at_begline = 0;
4312 /* Check for out-of-range STARTPOS. */
4313 if (startpos < 0 || startpos > total_size)
4316 /* Fix up RANGE if it might eventually take us outside
4317 the virtual concatenation of STRING1 and STRING2. */
4319 range = 0 - startpos;
4320 else if (endpos > total_size)
4321 range = total_size - startpos;
4323 /* If the search isn't to be a backwards one, don't waste time in a
4324 search for a pattern that must be anchored. */
4325 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf
4330 d = ((const unsigned char *)
4332 size1 ? string2 - size1 : string1) + startpos);
4333 range = charcount_to_bytecount(d, 1);
4337 /* In a forward search for something that starts with \=.
4338 don't keep searching past point. */
4339 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot
4342 BUF_PT(regex_emacs_buffer) - BUF_BEGV(regex_emacs_buffer)
4349 /* Update the fastmap now if not correct already. */
4350 if (fastmap && !bufp->fastmap_accurate)
4351 if (re_compile_fastmap(bufp) == -2)
4354 #ifdef REGEX_BEGLINE_CHECK
4356 unsigned long i = 0;
4358 while (i < bufp->used) {
4359 if (bufp->buffer[i] == start_memory ||
4360 bufp->buffer[i] == stop_memory)
4365 anchored_at_begline = i < bufp->used
4366 && bufp->buffer[i] == begline;
4371 SETUP_SYNTAX_CACHE_FOR_OBJECT(regex_match_object,
4373 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR
4374 (regex_match_object, regex_emacs_buffer,
4378 /* Loop through the string, looking for a place to start matching. */
4380 #ifdef REGEX_BEGLINE_CHECK
4381 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4382 then we can speed things up by skipping to the next beginning-of-
4384 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4386 /* whose stupid idea was it anyway to make this
4387 function take two strings to match?? */
4391 if (startpos < size1 && startpos + range >= size1)
4392 lim = range - (size1 - startpos);
4394 d = ((const unsigned char *)
4396 size1 ? string2 - size1 : string1) + startpos);
4397 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4398 d_size = charcount_to_bytecount(d, 1);
4400 if (TRANSLATE_P(translate))
4401 while (range > lim && *d != '\n') {
4402 d += d_size; /* Speedier INC_CHARPTR(d) */
4403 d_size = charcount_to_bytecount(d, 1);
4406 while (range > lim && *d != '\n') {
4407 d += d_size; /* Speedier INC_CHARPTR(d) */
4408 d_size = charcount_to_bytecount(d, 1);
4412 startpos += irange - range;
4414 #endif /* REGEX_BEGLINE_CHECK */
4416 /* If a fastmap is supplied, skip quickly over characters that
4417 cannot be the start of a match. If the pattern can match the
4418 null string, however, we don't need to skip characters; we want
4419 the first null string. */
4420 if (fastmap && startpos < total_size && !bufp->can_be_null) {
4421 if (range > 0) { /* Searching forwards. */
4425 if (startpos < size1
4426 && startpos + range >= size1)
4427 lim = range - (size1 - startpos);
4429 d = ((const unsigned char *)
4431 size1 ? string2 - size1 : string1) +
4434 /* Written out as an if-else to avoid testing `translate'
4436 if (TRANSLATE_P(translate))
4437 while (range > lim) {
4440 Bufbyte str[MAX_EMCHAR_LEN];
4442 buf_ch = charptr_emchar(d);
4443 buf_ch = RE_TRANSLATE(buf_ch);
4444 set_charptr_emchar(str, buf_ch);
4446 || fastmap[(unsigned char)
4456 charcount_to_bytecount(d,
4459 d += d_size; /* Speedier INC_CHARPTR(d) */
4461 while (range > lim && !fastmap[*d]) {
4463 charcount_to_bytecount(d,
4466 d += d_size; /* Speedier INC_CHARPTR(d) */
4469 startpos += irange - range;
4470 } else { /* Searching backwards. */
4472 Emchar c = (size1 == 0 || startpos >= size1
4473 ? charptr_emchar(string2 +
4475 : charptr_emchar(string1 +
4479 if (!(c >= 0200 || fastmap[(unsigned char)c]))
4482 if (!fastmap[(unsigned char)c])
4488 /* If can't match the null string, and that's all we have left, fail. */
4489 if (range >= 0 && startpos == total_size && fastmap
4490 && !bufp->can_be_null)
4493 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4494 if (!no_quit_in_re_search)
4497 val = re_match_2_internal(bufp, string1, size1, string2, size2,
4498 startpos, regs, stop);
4499 #ifndef REGEX_MALLOC
4514 else if (range > 0) {
4515 d = ((const unsigned char *)
4517 size1 ? string2 - size1 : string1) + startpos);
4518 d_size = charcount_to_bytecount(d, 1);
4522 /* Note startpos > size1 not >=. If we are on the
4523 string1/string2 boundary, we want to backup into string1. */
4524 d = ((const unsigned char *)
4526 size1 ? string2 - size1 : string1) + startpos);
4528 d_size = charcount_to_bytecount(d, 1);
4536 /* Declarations and macros for re_match_2. */
4538 /* This converts PTR, a pointer into one of the search strings `string1'
4539 and `string2' into an offset from the beginning of that string. */
4540 #define POINTER_TO_OFFSET(ptr) \
4541 (FIRST_STRING_P (ptr) \
4542 ? ((regoff_t) ((ptr) - string1)) \
4543 : ((regoff_t) ((ptr) - string2 + size1)))
4545 /* Macros for dealing with the split strings in re_match_2. */
4547 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4549 /* Call before fetching a character with *d. This switches over to
4550 string2 if necessary. */
4551 #define REGEX_PREFETCH() \
4554 /* End of string2 => fail. */ \
4555 if (dend == end_match_2) \
4557 /* End of string1 => advance to string2. */ \
4559 dend = end_match_2; \
4562 /* Test if at very beginning or at very end of the virtual concatenation
4563 of `string1' and `string2'. If only one string, it's `string2'. */
4564 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4565 #define AT_STRINGS_END(d) ((d) == end2)
4568 If the given position straddles the string gap, return the equivalent
4569 position that is before or after the gap, respectively; otherwise,
4570 return the same position. */
4571 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4572 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4574 /* Test if CH is a word-constituent character. (XEmacs change) */
4575 #define WORDCHAR_P_UNSAFE(ch) \
4576 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4579 /* Free everything we malloc. */
4580 #ifdef MATCH_MAY_ALLOCATE
4581 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4582 #define FREE_VARIABLES() \
4584 REGEX_FREE_STACK (fail_stack.stack); \
4585 FREE_VAR (regstart); \
4586 FREE_VAR (regend); \
4587 FREE_VAR (old_regstart); \
4588 FREE_VAR (old_regend); \
4589 FREE_VAR (best_regstart); \
4590 FREE_VAR (best_regend); \
4591 FREE_VAR (reg_info); \
4592 FREE_VAR (reg_dummy); \
4593 FREE_VAR (reg_info_dummy); \
4595 #else /* not MATCH_MAY_ALLOCATE */
4596 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4597 #endif /* MATCH_MAY_ALLOCATE */
4599 /* These values must meet several constraints. They must not be valid
4600 register values; since we have a limit of 255 registers (because
4601 we use only one byte in the pattern for the register number), we can
4602 use numbers larger than 255. They must differ by 1, because of
4603 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4604 be larger than the value for the highest register, so we do not try
4605 to actually save any registers when none are active. */
4606 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4607 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4609 /* Matching routines. */
4611 #ifndef emacs /* Emacs never uses this. */
4612 /* re_match is like re_match_2 except it takes only a single string. */
4615 re_match(struct re_pattern_buffer *bufp, const char *string, int size,
4616 int pos, struct re_registers *regs)
4619 re_match_2_internal(bufp, NULL, 0, (re_char *) string, size,
4624 #endif /* not emacs */
4626 /* re_match_2 matches the compiled pattern in BUFP against the
4627 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4628 SIZE2, respectively). We start matching at POS, and stop matching
4631 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4632 store offsets for the substring each group matched in REGS. See the
4633 documentation for exactly how many groups we fill.
4635 We return -1 if no match, -2 if an internal error (such as the
4636 failure stack overflowing). Otherwise, we return the length of the
4637 matched substring. */
4640 re_match_2(struct re_pattern_buffer *bufp, const char *string1,
4641 int size1, const char *string2, int size2, int pos,
4642 struct re_registers *regs, int stop)
4647 SETUP_SYNTAX_CACHE_FOR_OBJECT(regex_match_object,
4649 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR
4650 (regex_match_object, regex_emacs_buffer,
4654 result = re_match_2_internal(bufp, (re_char *) string1, size1,
4655 (re_char *) string2, size2,
4662 /* This is a separate function so that we can force an alloca cleanup
4665 re_match_2_internal(struct re_pattern_buffer *bufp, re_char * string1,
4666 int size1, re_char * string2, int size2, int pos,
4667 struct re_registers *regs, int stop)
4669 /* General temporaries. */
4672 int should_succeed; /* XEmacs change */
4674 /* Just past the end of the corresponding string. */
4675 re_char *end1, *end2;
4677 /* Pointers into string1 and string2, just past the last characters in
4678 each to consider matching. */
4679 re_char *end_match_1, *end_match_2;
4681 /* Where we are in the data, and the end of the current string. */
4684 /* Where we are in the pattern, and the end of the pattern. */
4685 unsigned char *p = bufp->buffer;
4686 REGISTER unsigned char *pend = p + bufp->used;
4688 /* Mark the opcode just after a start_memory, so we can test for an
4689 empty subpattern when we get to the stop_memory. */
4690 re_char *just_past_start_mem = 0;
4692 /* We use this to map every character in the string. */
4693 RE_TRANSLATE_TYPE translate = bufp->translate;
4695 /* Failure point stack. Each place that can handle a failure further
4696 down the line pushes a failure point on this stack. It consists of
4697 restart, regend, and reg_info for all registers corresponding to
4698 the subexpressions we're currently inside, plus the number of such
4699 registers, and, finally, two char *'s. The first char * is where
4700 to resume scanning the pattern; the second one is where to resume
4701 scanning the strings. If the latter is zero, the failure point is
4702 a ``dummy''; if a failure happens and the failure point is a dummy,
4703 it gets discarded and the next one is tried. */
4704 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4705 fail_stack_type fail_stack;
4707 #ifdef REGEX_DEBUG_FLAG
4708 static unsigned failure_id;
4709 int nfailure_points_pushed = 0, nfailure_points_popped = 0;
4713 /* This holds the pointer to the failure stack, when
4714 it is allocated relocatably. */
4715 fail_stack_elt_t *failure_stack_ptr;
4718 /* We fill all the registers internally, independent of what we
4719 return, for use in backreferences. The number here includes
4720 an element for register zero. */
4721 int num_regs = bufp->re_ngroups + 1;
4723 /* The currently active registers. */
4724 int lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4725 int highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4727 /* Information on the contents of registers. These are pointers into
4728 the input strings; they record just what was matched (on this
4729 attempt) by a subexpression part of the pattern, that is, the
4730 regnum-th regstart pointer points to where in the pattern we began
4731 matching and the regnum-th regend points to right after where we
4732 stopped matching the regnum-th subexpression. (The zeroth register
4733 keeps track of what the whole pattern matches.) */
4734 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4735 re_char **regstart, **regend;
4738 /* If a group that's operated upon by a repetition operator fails to
4739 match anything, then the register for its start will need to be
4740 restored because it will have been set to wherever in the string we
4741 are when we last see its open-group operator. Similarly for a
4743 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4744 re_char **old_regstart, **old_regend;
4747 /* The is_active field of reg_info helps us keep track of which (possibly
4748 nested) subexpressions we are currently in. The matched_something
4749 field of reg_info[reg_num] helps us tell whether or not we have
4750 matched any of the pattern so far this time through the reg_num-th
4751 subexpression. These two fields get reset each time through any
4752 loop their register is in. */
4753 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4754 register_info_type *reg_info;
4757 /* The following record the register info as found in the above
4758 variables when we find a match better than any we've seen before.
4759 This happens as we backtrack through the failure points, which in
4760 turn happens only if we have not yet matched the entire string. */
4761 unsigned best_regs_set = false;
4762 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4763 re_char **best_regstart, **best_regend;
4766 /* Logically, this is `best_regend[0]'. But we don't want to have to
4767 allocate space for that if we're not allocating space for anything
4768 else (see below). Also, we never need info about register 0 for
4769 any of the other register vectors, and it seems rather a kludge to
4770 treat `best_regend' differently than the rest. So we keep track of
4771 the end of the best match so far in a separate variable. We
4772 initialize this to NULL so that when we backtrack the first time
4773 and need to test it, it's not garbage. */
4774 re_char *match_end = NULL;
4776 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4777 int set_regs_matched_done = 0;
4779 /* Used when we pop values we don't care about. */
4780 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4781 re_char **reg_dummy;
4782 register_info_type *reg_info_dummy;
4785 #ifdef REGEX_DEBUG_FLAG
4786 /* Counts the total number of registers pushed. */
4787 unsigned num_regs_pushed = 0;
4790 /* 1 if this match ends in the same string (string1 or string2)
4791 as the best previous match. */
4794 /* 1 if this match is the best seen so far. */
4795 re_bool best_match_p;
4797 DEBUG_PRINT1("\n\nEntering re_match_2.\n");
4801 #ifdef MATCH_MAY_ALLOCATE
4802 /* Do not bother to initialize all the register variables if there are
4803 no groups in the pattern, as it takes a fair amount of time. If
4804 there are groups, we include space for register 0 (the whole
4805 pattern), even though we never use it, since it simplifies the
4806 array indexing. We should fix this. */
4807 if (bufp->re_ngroups) {
4808 regstart = REGEX_TALLOC(num_regs, re_char *);
4809 regend = REGEX_TALLOC(num_regs, re_char *);
4810 old_regstart = REGEX_TALLOC(num_regs, re_char *);
4811 old_regend = REGEX_TALLOC(num_regs, re_char *);
4812 best_regstart = REGEX_TALLOC(num_regs, re_char *);
4813 best_regend = REGEX_TALLOC(num_regs, re_char *);
4814 reg_info = REGEX_TALLOC(num_regs, register_info_type);
4815 reg_dummy = REGEX_TALLOC(num_regs, re_char *);
4816 reg_info_dummy = REGEX_TALLOC(num_regs, register_info_type);
4819 (regstart && regend && old_regstart && old_regend
4820 && reg_info && best_regstart && best_regend && reg_dummy
4821 && reg_info_dummy)) {
4826 /* We must initialize all our variables to NULL, so that
4827 `FREE_VARIABLES' doesn't try to free them. */
4828 regstart = regend = old_regstart = old_regend = best_regstart
4829 = best_regend = reg_dummy = NULL;
4830 reg_info = reg_info_dummy = (register_info_type *) NULL;
4832 #endif /* MATCH_MAY_ALLOCATE */
4834 /* The starting position is bogus. */
4835 if (pos < 0 || pos > size1 + size2) {
4840 /* Initialize subexpression text positions to -1 to mark ones that no
4841 start_memory/stop_memory has been seen for. Also initialize the
4842 register information struct. */
4843 for (mcnt = 1; mcnt < num_regs; mcnt++) {
4844 regstart[mcnt] = regend[mcnt]
4845 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4847 REG_MATCH_NULL_STRING_P(reg_info[mcnt]) =
4848 MATCH_NULL_UNSET_VALUE;
4849 IS_ACTIVE(reg_info[mcnt]) = 0;
4850 MATCHED_SOMETHING(reg_info[mcnt]) = 0;
4851 EVER_MATCHED_SOMETHING(reg_info[mcnt]) = 0;
4853 /* We move `string1' into `string2' if the latter's empty -- but not if
4854 `string1' is null. */
4855 if (size2 == 0 && string1 != NULL) {
4861 end1 = string1 + size1;
4862 end2 = string2 + size2;
4864 /* Compute where to stop matching, within the two strings. */
4865 if (stop <= size1) {
4866 end_match_1 = string1 + stop;
4867 end_match_2 = string2;
4870 end_match_2 = string2 + stop - size1;
4873 /* `p' scans through the pattern as `d' scans through the data.
4874 `dend' is the end of the input string that `d' points within. `d'
4875 is advanced into the following input string whenever necessary, but
4876 this happens before fetching; therefore, at the beginning of the
4877 loop, `d' can be pointing at the end of a string, but it cannot
4879 if (size1 > 0 && pos <= size1) {
4883 d = string2 + pos - size1;
4887 DEBUG_PRINT1("The compiled pattern is: \n");
4888 DEBUG_PRINT_COMPILED_PATTERN(bufp, p, pend);
4889 DEBUG_PRINT1("The string to match is: `");
4890 DEBUG_PRINT_DOUBLE_STRING(d, string1, size1, string2, size2);
4891 DEBUG_PRINT1("'\n");
4893 /* This loops over pattern commands. It exits by returning from the
4894 function if the match is complete, or it drops through if the match
4895 fails at this starting point in the input data. */
4897 DEBUG_PRINT2("\n0x%lx: ", (long)p);
4898 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4899 if (!no_quit_in_re_search)
4904 /* End of pattern means we might have succeeded. */
4905 DEBUG_PRINT1("end of pattern ... ");
4907 /* If we haven't matched the entire string, and we want
4908 the longest match, try backtracking. */
4909 if (d != end_match_2) {
4910 same_str_p = (FIRST_STRING_P(match_end)
4911 == MATCHING_IN_FIRST_STRING);
4913 /* AIX compiler got confused when this was
4914 combined with the previous declaration. */
4916 best_match_p = d > match_end;
4919 !MATCHING_IN_FIRST_STRING;
4921 DEBUG_PRINT1("backtracking.\n");
4923 if (!FAIL_STACK_EMPTY()) {
4924 /* More failure points to try. */
4926 /* If exceeds best match so far, save
4928 if (!best_regs_set || best_match_p) {
4929 best_regs_set = true;
4933 ("\nSAVING match as best so far.\n");
4935 for (mcnt = 1; mcnt < num_regs;
4937 best_regstart[mcnt] =
4946 /* If no failure points, don't restore garbage.
4947 And if last match is real best match, don't
4948 restore second best one. */
4949 else if (best_regs_set && !best_match_p) {
4951 /* Restore best match. It may happen
4952 that `dend == end_match_1' while the
4953 restored d is in string2. For
4954 example, the pattern `x.*y.*z'
4955 against the strings `x-' and `y-z-',
4956 if the two strings are not
4957 consecutive in memory. */
4959 ("Restoring best registers.\n");
4962 dend = ((d >= string1 && d <= end1)
4963 ? end_match_1 : end_match_2);
4965 for (mcnt = 1; mcnt < num_regs; mcnt++) {
4967 best_regstart[mcnt];
4973 /* d != end_match_2 */
4975 DEBUG_PRINT1("Accepting match.\n");
4977 int num_nonshy_regs = bufp->re_nsub + 1;
4978 /* If caller wants register contents data back,
4980 if (regs && !bufp->no_sub) {
4981 /* Have the register data arrays been
4983 if (bufp->regs_allocated == REGS_UNALLOCATED) {
4984 /* No. So allocate them with malloc.
4985 We need one extra element beyond
4986 `num_regs' for the `-1' marker GNU
4988 regs->num_regs = MAX(RE_NREGS, num_nonshy_regs + 1);
4989 regs->start = TALLOC(regs->num_regs, regoff_t);
4990 regs->end = TALLOC(regs->num_regs, regoff_t);
4991 if (regs->start == NULL || regs->end == NULL) {
4995 bufp->regs_allocated = REGS_REALLOCATE;
4996 } else if (bufp->regs_allocated == REGS_REALLOCATE) {
4997 /* Yes. If we need more elements than were already
4998 allocated, reallocate them. If we need fewer, just
5000 if (regs->num_regs < num_nonshy_regs + 1) {
5001 regs->num_regs = num_nonshy_regs + 1;
5002 RETALLOC(regs->start, regs->num_regs, regoff_t);
5003 RETALLOC(regs->end, regs->num_regs, regoff_t);
5004 if (regs->start == NULL || regs->end == NULL) {
5010 /* The braces fend off a "empty body in an else-statement"
5011 warning under GCC when assert expands to nothing. */
5012 assert (bufp->regs_allocated == REGS_FIXED);
5015 /* Convert the pointer data in `regstart' and `regend' to
5016 indices. Register zero has to be set differently,
5017 since we haven't kept track of any info for it. */
5018 if (regs->num_regs > 0) {
5019 regs->start[0] = pos;
5020 regs->end[0] = (MATCHING_IN_FIRST_STRING
5021 ? ((regoff_t) (d - string1))
5022 : ((regoff_t) (d - string2 + size1)));
5025 /* Map over the NUM_NONSHY_REGS non-shy internal registers.
5026 Copy each into the corresponding external register.
5027 N.B. MCNT indexes external registers. */
5029 mcnt < MIN (num_nonshy_regs, regs->num_regs);
5031 int ireg = bufp->external_to_internal_register[mcnt];
5033 if (REG_UNSET (regstart[ireg]) || REG_UNSET (regend[ireg]))
5034 regs->start[mcnt] = regs->end[mcnt] = -1;
5037 = (regoff_t) POINTER_TO_OFFSET (regstart[ireg]);
5039 = (regoff_t) POINTER_TO_OFFSET (regend[ireg]);
5042 } /* regs && !bufp->no_sub */
5044 /* If we have regs and the regs structure has
5045 more elements than were in the pattern, set
5046 the extra elements to -1. If we
5047 (re)allocated the registers, this is the
5048 case, because we always allocate enough to
5049 have at least one -1 at the end.
5051 We do this even when no_sub is set because
5052 some applications (XEmacs) reuse register
5053 structures which may contain stale
5054 information, and permit attempts to access
5057 It would be possible to require the caller to
5058 do this, but we'd have to change the API for
5059 this function to reflect that, and audit all
5061 if (regs && regs->num_regs > 0)
5062 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++)
5063 regs->start[mcnt] = regs->end[mcnt] = -1;
5066 DEBUG_PRINT4("%u failure points pushed, %u popped (%u remain).\n",
5067 nfailure_points_pushed, nfailure_points_popped,
5068 nfailure_points_pushed - nfailure_points_popped);
5069 DEBUG_PRINT2("%u registers pushed.\n", num_regs_pushed);
5071 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
5075 DEBUG_PRINT2("Returning %d from re_match_2.\n", mcnt);
5081 /* Otherwise match next pattern command. */
5082 switch (SWITCH_ENUM_CAST((re_opcode_t) *p++)) {
5083 /* Ignore these. Used to ignore the n of succeed_n's
5084 which currently have n == 0. */
5086 DEBUG_PRINT1("EXECUTING no_op.\n");
5090 DEBUG_PRINT1("EXECUTING succeed.\n");
5093 /* Match the next n pattern characters exactly. The
5094 following byte in the pattern defines n, and the n
5095 bytes after that are the characters to match. */
5098 DEBUG_PRINT2("EXECUTING exactn %d.\n", mcnt);
5100 /* This is written out as an if-else so we don't waste
5101 time testing `translate' inside the loop. */
5102 if (TRANSLATE_P(translate)) {
5105 Emchar pat_ch, buf_ch;
5109 pat_ch = charptr_emchar(p);
5110 buf_ch = charptr_emchar(d);
5111 if (RE_TRANSLATE(buf_ch) != pat_ch)
5114 pat_len = charcount_to_bytecount(p, 1);
5119 #else /* not MULE */
5121 if ((unsigned char)RE_TRANSLATE(*d++) !=
5139 /* Match any character except possibly a newline or a
5142 DEBUG_PRINT1("EXECUTING anychar.\n");
5146 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5147 && TRANSLATE(*d) == '\n')
5148 || (bufp->syntax & RE_DOT_NOT_NULL
5149 && TRANSLATE(*d) == '\000'))
5153 DEBUG_PRINT2(" Matched `%d'.\n", *d);
5154 INC_CHARPTR(d); /* XEmacs change */
5159 REGISTER unsigned char c;
5161 (re_opcode_t) * (p - 1) == charset_not;
5163 DEBUG_PRINT2("EXECUTING charset%s.\n",
5164 not_p ? "_not" : "");
5167 /* The character to match. */
5170 /* Cast to `unsigned' instead of `unsigned char'
5171 in case the bit list is a full 32 bytes
5173 if (c < (unsigned)(*p * BYTEWIDTH)
5175 c / BYTEWIDTH] & (1 << (c %
5185 INC_CHARPTR(d); /* XEmacs change */
5191 case charset_mule_not: {
5194 (re_opcode_t) * (p - 1) == charset_mule_not;
5196 DEBUG_PRINT2("EXECUTING charset_mule%s.\n",
5197 not_p ? "_not" : "");
5200 c = charptr_emchar((const Bufbyte *)d);
5201 /* The character to match. */
5206 unified_range_table_lookup(p, c, Qnil)))
5209 p += unified_range_table_bytes_used(p);
5220 /* The beginning of a group is represented by
5221 start_memory. The arguments are the register number
5222 in the next byte, and the number of groups inner to
5223 this one in the next. The text matched within the
5224 group is recorded (in the internal registers data
5225 structure) under the register number. */
5227 DEBUG_PRINT3("EXECUTING start_memory %d (%d):\n", *p,
5230 /* Find out if this group can match the empty string. */
5231 p1 = p; /* To send to group_match_null_string_p. */
5233 if (REG_MATCH_NULL_STRING_P(reg_info[*p]) ==
5234 MATCH_NULL_UNSET_VALUE)
5235 REG_MATCH_NULL_STRING_P(reg_info[*p])
5236 = group_match_null_string_p(&p1, pend,
5239 /* Save the position in the string where we were the
5240 last time we were at this open-group operator in case
5241 the group is operated upon by a repetition operator,
5242 e.g., with `(a*)*b' against `ab'; then we want to
5243 ignore where we are now in the string in case this
5244 attempt to match fails. */
5245 old_regstart[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
5246 ? REG_UNSET(regstart[*p]) ? d : regstart[*p]
5248 DEBUG_PRINT2(" old_regstart: %d\n",
5249 POINTER_TO_OFFSET(old_regstart[*p]));
5252 DEBUG_PRINT2(" regstart: %d\n",
5253 POINTER_TO_OFFSET(regstart[*p]));
5255 IS_ACTIVE(reg_info[*p]) = 1;
5256 MATCHED_SOMETHING(reg_info[*p]) = 0;
5258 /* Clear this whenever we change the register activity
5260 set_regs_matched_done = 0;
5262 /* This is the new highest active register. */
5263 highest_active_reg = *p;
5265 /* If nothing was active before, this is the new lowest
5267 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5268 lowest_active_reg = *p;
5270 /* Move past the register number and inner group
5273 just_past_start_mem = p;
5277 /* The stop_memory opcode represents the end of a group.
5278 Its arguments are the same as start_memory's: the
5279 register number, and the number of inner groups. */
5281 DEBUG_PRINT3("EXECUTING stop_memory %d (%d):\n", *p,
5284 /* We need to save the string position the last time we
5285 were at this close-group operator in case the group
5286 is operated upon by a repetition operator, e.g., with
5287 `((a*)*(b*)*)*' against `aba'; then we want to ignore
5288 where we are now in the string in case this attempt
5290 old_regend[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
5291 ? REG_UNSET(regend[*p]) ? d : regend[*p]
5293 DEBUG_PRINT2(" old_regend: %d\n",
5294 POINTER_TO_OFFSET(old_regend[*p]));
5297 DEBUG_PRINT2(" regend: %d\n",
5298 POINTER_TO_OFFSET(regend[*p]));
5300 /* This register isn't active anymore. */
5301 IS_ACTIVE(reg_info[*p]) = 0;
5303 /* Clear this whenever we change the register activity
5305 set_regs_matched_done = 0;
5307 /* If this was the only register active, nothing is
5309 if (lowest_active_reg == highest_active_reg) {
5310 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5311 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5312 } else { /* We must scan for the new highest
5313 active register, since it isn't
5314 necessarily one less than now:
5315 consider (a(b)c(d(e)f)g). When group
5316 3 ends, after the f), the new highest
5317 active register is 1. */
5318 unsigned char r = *p - 1;
5319 while (r > 0 && !IS_ACTIVE(reg_info[r]))
5322 /* If we end up at register zero, that means
5323 that we saved the registers as the result of
5324 an `on_failure_jump', not a `start_memory',
5325 and we jumped to past the innermost
5326 `stop_memory'. For example, in ((.)*) we
5327 save registers 1 and 2 as a result of the *,
5328 but when we pop back to the second ), we are
5329 at the stop_memory 1. Thus, nothing is
5333 NO_LOWEST_ACTIVE_REG;
5334 highest_active_reg =
5335 NO_HIGHEST_ACTIVE_REG;
5337 highest_active_reg = r;
5339 /* 98/9/21 jhod: We've also gotta set
5340 lowest_active_reg, don't we? */
5342 while (r < highest_active_reg
5343 && !IS_ACTIVE(reg_info[r]))
5345 lowest_active_reg = r;
5349 /* If just failed to match something this time around
5350 with a group that's operated on by a repetition
5351 operator, try to force exit from the ``loop'', and
5352 restore the register information for this group that
5353 we had before trying this last match. */
5354 if ((!MATCHED_SOMETHING(reg_info[*p])
5355 || just_past_start_mem == p - 1)
5356 && (p + 2) < pend) {
5357 re_bool is_a_jump_n = false;
5361 switch ((unsigned int)(re_opcode_t)*p1++) {
5364 case pop_failure_jump:
5365 case maybe_pop_jump:
5367 case dummy_failure_jump:
5368 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
5378 /* If the next operation is a jump backwards in
5379 the pattern to an on_failure_jump right
5380 before the start_memory corresponding to this
5381 stop_memory, exit from the loop by forcing a
5382 failure after pushing on the stack the
5383 on_failure_jump's jump in the pattern, and
5386 && (re_opcode_t) * p1 == on_failure_jump
5387 && (re_opcode_t) p1[3] == start_memory
5389 /* If this group ever matched anything,
5390 then restore what its registers were
5391 before trying this last failed match,
5392 e.g., with `(a*)*b' against `ab' for
5393 regstart[1], and, e.g., with
5394 `((a*)*(b*)*)*' against `aba' for
5397 Also restore the registers for inner
5398 groups for, e.g., `((a*)(b*))*'
5399 against `aba' (register 3 would
5400 otherwise get trashed). */
5402 if (EVER_MATCHED_SOMETHING
5406 EVER_MATCHED_SOMETHING(reg_info
5410 /* Restore this and inner
5413 for (r = *p; r < *p + *(p + 1);
5418 /* xx why this test? */
5419 if (old_regend[r] >=
5427 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
5428 PUSH_FAILURE_POINT(p1 + mcnt, d, -2);
5434 /* Move past the register number and the inner group
5439 /* \<digit> has been turned into a `duplicate' command
5440 which is followed by the numeric value of <digit> as
5441 the register number. (Already passed through
5442 external-to-internal-register mapping, so it refers
5443 to the actual group number, not the non-shy-only
5444 numbering used in the external world.) */
5447 REGISTER re_char *d2, *dend2;
5448 /* Get which register to match against. */
5450 DEBUG_PRINT2("EXECUTING duplicate %d.\n",
5453 /* Can't back reference a group which we've
5455 if (REG_UNSET(regstart[regno])
5456 || REG_UNSET(regend[regno]))
5459 /* Where in input to try to start matching. */
5460 d2 = regstart[regno];
5462 /* Where to stop matching; if both the place to
5463 start and the place to stop matching are in
5464 the same string, then set to the place to
5465 stop, otherwise, for now have to use the end
5466 of the first string. */
5468 dend2 = ((FIRST_STRING_P(regstart[regno])
5469 == FIRST_STRING_P(regend[regno]))
5470 ? regend[regno] : end_match_1);
5472 /* If necessary, advance to next segment
5473 in register contents. */
5474 while (d2 == dend2) {
5475 if (dend2 == end_match_2)
5477 if (dend2 == regend[regno])
5480 /* End of string1 => advance to
5483 dend2 = regend[regno];
5485 /* At end of register contents =>
5490 /* If necessary, advance to next segment
5494 /* How many characters left in this segment to match. */
5497 /* Want how many consecutive characters
5498 we can match in one shot, so, if
5499 necessary, adjust the count. */
5500 if (mcnt > dend2 - d2)
5503 /* Compare that many; failure if
5504 mismatch, else move past them. */
5505 if (TRANSLATE_P(translate)
5507 (const unsigned char*)d,
5508 (const unsigned char*)d2,
5510 : memcmp(d, d2, mcnt)) {
5513 d += mcnt, d2 += mcnt;
5515 /* Do this because we've match some
5522 /* begline matches the empty string at the beginning of
5523 the string (unless `not_bol' is set in `bufp'), and,
5524 if `newline_anchor' is set, after newlines. */
5526 DEBUG_PRINT1("EXECUTING begline.\n");
5528 if (AT_STRINGS_BEG(d)) {
5531 } else if (d[-1] == '\n' && bufp->newline_anchor) {
5534 /* In all other cases, we fail. */
5537 /* endline is the dual of begline. */
5539 DEBUG_PRINT1("EXECUTING endline.\n");
5541 if (AT_STRINGS_END(d)) {
5546 /* We have to ``prefetch'' the next character. */
5547 else if ((d == end1 ? *string2 : *d) == '\n'
5548 && bufp->newline_anchor) {
5553 /* Match at the very beginning of the data. */
5555 DEBUG_PRINT1("EXECUTING begbuf.\n");
5556 if (AT_STRINGS_BEG(d))
5560 /* Match at the very end of the data. */
5562 DEBUG_PRINT1("EXECUTING endbuf.\n");
5563 if (AT_STRINGS_END(d))
5567 /* on_failure_keep_string_jump is used to optimize
5568 `.*\n'. It pushes NULL as the value for the string
5569 on the stack. Then `pop_failure_point' will keep the
5570 current value for the string, instead of restoring
5571 it. To see why, consider matching `foo\nbar' against
5572 `.*\n'. The .* matches the foo; then the . fails
5573 against the \n. But the next thing we want to do is
5574 match the \n against the \n; if we restored the
5575 string value, we would be back at the foo.
5577 Because this is used only in specific cases, we don't
5578 need to check all the things that `on_failure_jump'
5579 does, to make sure the right things get saved on the
5580 stack. Hence we don't share its code. The only
5581 reason to push anything on the stack at all is that
5582 otherwise we would have to change `anychar's code to
5583 do something besides goto fail in this case; that
5584 seems worse than this. */
5585 case on_failure_keep_string_jump:
5586 DEBUG_PRINT1("EXECUTING on_failure_keep_string_jump");
5588 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5589 DEBUG_PRINT3(" %d (to 0x%lx):\n", mcnt,
5592 PUSH_FAILURE_POINT(p + mcnt, (unsigned char *)0, -2);
5595 /* Uses of on_failure_jump:
5597 Each alternative starts with an on_failure_jump that
5598 points to the beginning of the next alternative.
5599 Each alternative except the last ends with a jump
5600 that in effect jumps past the rest of the
5601 alternatives. (They really jump to the ending jump
5602 of the following alternative, because tensioning
5603 these jumps is a hassle.)
5605 Repeats start with an on_failure_jump that points
5606 past both the repetition text and either the
5607 following jump or pop_failure_jump back to this
5609 case on_failure_jump:
5611 DEBUG_PRINT1("EXECUTING on_failure_jump");
5613 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5614 DEBUG_PRINT3(" %d (to 0x%lx)", mcnt, (long)(p + mcnt));
5616 /* If this on_failure_jump comes right before a group
5617 (i.e., the original * applied to a group), save the
5618 information for that group and all inner ones, so
5619 that if we fail back to this point, the group's
5620 information will be correct. For example, in
5621 \(a*\)*\1, we need the preceding group, and in
5622 \(\(a*\)b*\)\2, we need the inner group. */
5624 /* We can't use `p' to check ahead because we push
5625 a failure point to `p + mcnt' after we do this. */
5628 /* We need to skip no_op's before we look for the
5629 start_memory in case this on_failure_jump is
5630 happening as the result of a completed succeed_n, as
5631 in \(a\)\{1,3\}b\1 against aba. */
5632 while (p1 < pend && (re_opcode_t) * p1 == no_op)
5635 if (p1 < pend && (re_opcode_t) * p1 == start_memory) {
5636 /* We have a new highest active register now.
5637 This will get reset at the start_memory we
5638 are about to get to, but we will have saved
5639 all the registers relevant to this repetition
5640 op, as described above. */
5641 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5642 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5643 lowest_active_reg = *(p1 + 1);
5646 DEBUG_PRINT1(":\n");
5647 PUSH_FAILURE_POINT(p + mcnt, d, -2);
5650 /* A smart repeat ends with `maybe_pop_jump'.
5651 We change it to either `pop_failure_jump' or `jump'. */
5652 case maybe_pop_jump:
5653 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5654 DEBUG_PRINT2("EXECUTING maybe_pop_jump %d.\n", mcnt);
5656 REGISTER unsigned char *p2 = p;
5658 /* Compare the beginning of the repeat with what
5659 in the pattern follows its end. If we can
5660 establish that there is nothing that they
5661 would both match, i.e., that we would have to
5662 backtrack because of (as in, e.g., `a*a')
5663 then we can change to pop_failure_jump,
5664 because we'll never have to backtrack.
5666 This is not true in the case of alternatives:
5667 in `(a|ab)*' we do need to backtrack to the
5668 `ab' alternative (e.g., if the string was
5669 `ab'). But instead of trying to detect that
5670 here, the alternative has put on a dummy
5671 failure point which is what we will end up
5674 /* Skip over open/close-group commands. If what
5675 follows this loop is a ...+ construct, look
5676 at what begins its body, since we will have
5677 to match at least one of that. */
5680 && ((re_opcode_t) * p2 ==
5682 || (re_opcode_t) * p2 ==
5685 else if (p2 + 6 < pend
5686 && (re_opcode_t) * p2 ==
5694 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5695 to the `maybe_finalize_jump' of this case. Examine what
5698 /* If we're at the end of the pattern, we can change. */
5700 /* Consider what happens when matching ":\(.*\)"
5701 against ":/". I don't really understand this code
5703 p[-3] = (unsigned char)pop_failure_jump;
5705 (" End of pattern: change to `pop_failure_jump'.\n");
5708 else if ((re_opcode_t) * p2 == exactn
5709 || (bufp->newline_anchor
5710 && (re_opcode_t) * p2 ==
5712 REGISTER unsigned char c =
5714 (unsigned char)endline ? '\n' :
5717 if ((re_opcode_t) p1[3] == exactn
5723 (" %c != %c => pop_failure_jump.\n",
5727 else if ((re_opcode_t) p1[3] == charset
5728 || (re_opcode_t) p1[3] ==
5731 (re_opcode_t) p1[3] ==
5735 (unsigned char)(p1[4] *
5739 BYTEWIDTH] & (1 << (c
5744 /* `not_p' is equal to 1 if c would match, which means
5745 that we can't change to pop_failure_jump. */
5751 (" No match => pop_failure_jump.\n");
5754 } else if ((re_opcode_t) * p2 == charset) {
5755 #ifdef REGEX_DEBUG_FLAG
5756 REGISTER unsigned char c
5759 (unsigned char)endline ? '\n' :
5763 if ((re_opcode_t) p1[3] == exactn
5764 && !((int)p2[1] * BYTEWIDTH >
5766 && (p2[2 + p1[5] / BYTEWIDTH]
5774 (" %c != %c => pop_failure_jump.\n",
5778 else if ((re_opcode_t) p1[3] ==
5781 /* We win if the charset_not inside the loop
5782 lists every character listed in the charset after. */
5783 for (idx = 0; idx < (int)p2[1];
5801 (" No match => pop_failure_jump.\n");
5803 } else if ((re_opcode_t) p1[3] ==
5806 /* We win if the charset inside the loop
5807 has no overlap with the one after the loop. */
5810 && idx < (int)p1[4]; idx++)
5821 (" No match => pop_failure_jump.\n");
5826 p -= 2; /* Point at relative address again. */
5827 if ((re_opcode_t) p[-1] != pop_failure_jump) {
5828 p[-1] = (unsigned char)jump;
5829 DEBUG_PRINT1(" Match => jump.\n");
5830 goto unconditional_jump;
5832 /* Note fall through. */
5834 /* The end of a simple repeat has a pop_failure_jump
5835 back to its matching on_failure_jump, where the
5836 latter will push a failure point. The
5837 pop_failure_jump takes off failure points put on by
5838 this pop_failure_jump's matching on_failure_jump; we
5839 got through the pattern to here from the matching
5840 on_failure_jump, so didn't fail. */
5841 case pop_failure_jump: {
5842 /* We need to pass separate storage for the
5843 lowest and highest registers, even though we
5844 don't care about the actual values.
5845 Otherwise, we will restore only one register
5846 from the stack, since lowest will == highest
5847 in `pop_failure_point'. */
5848 int dummy_low_reg, dummy_high_reg;
5850 re_char *sdummy = NULL;
5852 DEBUG_PRINT1("EXECUTING pop_failure_jump.\n");
5853 POP_FAILURE_POINT(sdummy, pdummy,
5854 dummy_low_reg, dummy_high_reg,
5855 reg_dummy, reg_dummy,
5858 /* Note fall through. */
5860 /* Unconditionally jump (without popping any failure
5864 /* Get the amount to jump. */
5865 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5866 DEBUG_PRINT2("EXECUTING jump %d ", mcnt);
5867 p += mcnt; /* Do the jump. */
5868 DEBUG_PRINT2("(to 0x%lx).\n", (long)p);
5871 /* We need this opcode so we can detect where alternatives end
5872 in `group_match_null_string_p' et al. */
5874 DEBUG_PRINT1("EXECUTING jump_past_alt.\n");
5875 goto unconditional_jump;
5877 /* Normally, the on_failure_jump pushes a failure point, which
5878 then gets popped at pop_failure_jump. We will end up at
5879 pop_failure_jump, also, and with a pattern of, say, `a+', we
5880 are skipping over the on_failure_jump, so we have to push
5881 something meaningless for pop_failure_jump to pop. */
5882 case dummy_failure_jump:
5883 DEBUG_PRINT1("EXECUTING dummy_failure_jump.\n");
5884 /* It doesn't matter what we push for the string here. What
5885 the code at `fail' tests is the value for the pattern. */
5886 PUSH_FAILURE_POINT(NULL, NULL, -2);
5887 goto unconditional_jump;
5889 /* At the end of an alternative, we need to push a dummy failure
5890 point in case we are followed by a `pop_failure_jump', because
5891 we don't want the failure point for the alternative to be
5892 popped. For example, matching `(a|ab)*' against `aab'
5893 requires that we match the `ab' alternative. */
5894 case push_dummy_failure:
5895 DEBUG_PRINT1("EXECUTING push_dummy_failure.\n");
5896 /* See comments just above at `dummy_failure_jump' about the
5898 PUSH_FAILURE_POINT(NULL, NULL, -2);
5901 /* Have to succeed matching what follows at least n times.
5902 After that, handle like `on_failure_jump'. */
5904 EXTRACT_NUMBER(mcnt, p + 2);
5905 DEBUG_PRINT2("EXECUTING succeed_n %d.\n", mcnt);
5908 /* Originally, this is how many times we HAVE to succeed. */
5912 STORE_NUMBER_AND_INCR(p, mcnt);
5913 DEBUG_PRINT3(" Setting 0x%lx to %d.\n",
5915 } else if (mcnt == 0) {
5917 (" Setting two bytes from 0x%lx to no_op.\n",
5919 p[2] = (unsigned char)no_op;
5920 p[3] = (unsigned char)no_op;
5926 EXTRACT_NUMBER(mcnt, p + 2);
5927 DEBUG_PRINT2("EXECUTING jump_n %d.\n", mcnt);
5929 /* Originally, this is how many times we CAN jump. */
5932 STORE_NUMBER(p + 2, mcnt);
5933 goto unconditional_jump;
5935 /* If don't have to jump any more, skip over the rest of command. */
5942 DEBUG_PRINT1("EXECUTING set_number_at.\n");
5944 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5946 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5947 DEBUG_PRINT3(" Setting 0x%lx to %d.\n",
5949 STORE_NUMBER(p1, mcnt);
5954 DEBUG_PRINT1("EXECUTING wordbound.\n");
5959 /* Straightforward and (I hope) correct implementation.
5960 Probably should be optimized by arranging to compute
5962 /* emch1 is the character before d, syn1 is the syntax of
5963 emch1, emch2 is the character at d, and syn2 is the
5965 Emchar emch1, emch2;
5966 /* GCC isn't smart enough to see these are initialized if used. */
5967 int syn1 = 0, syn2 = 0;
5968 re_char *d_before, *d_after;
5970 at_beg = AT_STRINGS_BEG(d),
5971 at_end = AT_STRINGS_END(d);
5976 if (at_beg && at_end) {
5981 POS_BEFORE_GAP_UNSAFE(d);
5982 DEC_CHARPTR(d_before);
5984 charptr_emchar(d_before);
5987 SYNTAX_CACHE_BYTE_TO_CHAR
5988 (PTR_TO_OFFSET(d)) - 1;
5989 UPDATE_SYNTAX_CACHE(xpos);
5991 syn1 = SYNTAX_FROM_CACHE
5993 (regex_emacs_buffer->
5994 mirror_syntax_table),
5999 POS_AFTER_GAP_UNSAFE(d);
6000 emch2 = charptr_emchar(d_after);
6003 SYNTAX_CACHE_BYTE_TO_CHAR
6005 UPDATE_SYNTAX_CACHE_FORWARD(xpos
6009 syn2 = SYNTAX_FROM_CACHE
6011 (regex_emacs_buffer->
6012 mirror_syntax_table),
6017 result = (syn2 == Sword);
6019 result = (syn1 == Sword);
6026 if (result == should_succeed)
6032 DEBUG_PRINT1("EXECUTING notwordbound.\n");
6034 goto matchwordbound;
6037 DEBUG_PRINT1("EXECUTING wordbeg.\n");
6038 if (AT_STRINGS_END(d))
6041 /* XEmacs: this originally read:
6043 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
6047 re_char *dtmp = POS_AFTER_GAP_UNSAFE(d);
6048 Emchar emch = charptr_emchar(dtmp);
6051 SYNTAX_CACHE_BYTE_TO_CHAR(PTR_TO_OFFSET(d));
6052 UPDATE_SYNTAX_CACHE(charpos);
6054 if (SYNTAX_FROM_CACHE
6056 (regex_emacs_buffer->mirror_syntax_table),
6059 if (AT_STRINGS_BEG(d))
6061 dtmp = POS_BEFORE_GAP_UNSAFE(d);
6063 emch = charptr_emchar(dtmp);
6065 UPDATE_SYNTAX_CACHE_BACKWARD(charpos - 1);
6067 if (SYNTAX_FROM_CACHE
6069 (regex_emacs_buffer->mirror_syntax_table),
6076 DEBUG_PRINT1("EXECUTING wordend.\n");
6077 if (AT_STRINGS_BEG(d))
6080 /* XEmacs: this originally read:
6082 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
6083 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
6086 The or condition is incorrect (reversed).
6092 SYNTAX_CACHE_BYTE_TO_CHAR(PTR_TO_OFFSET(d))
6094 UPDATE_SYNTAX_CACHE(charpos);
6096 dtmp = POS_BEFORE_GAP_UNSAFE(d);
6098 emch = charptr_emchar(dtmp);
6099 if (SYNTAX_FROM_CACHE
6101 (regex_emacs_buffer->mirror_syntax_table),
6104 if (AT_STRINGS_END(d))
6106 dtmp = POS_AFTER_GAP_UNSAFE(d);
6107 emch = charptr_emchar(dtmp);
6109 UPDATE_SYNTAX_CACHE_FORWARD(charpos + 1);
6111 if (SYNTAX_FROM_CACHE
6113 (regex_emacs_buffer->mirror_syntax_table),
6121 DEBUG_PRINT1("EXECUTING before_dot.\n");
6123 (NILP(regex_match_object)
6124 || BUFFERP(regex_match_object))
6125 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6126 >= BUF_PT(regex_emacs_buffer)))
6131 DEBUG_PRINT1("EXECUTING at_dot.\n");
6133 (NILP(regex_match_object)
6134 || BUFFERP(regex_match_object))
6135 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6136 != BUF_PT(regex_emacs_buffer)))
6141 DEBUG_PRINT1("EXECUTING after_dot.\n");
6143 (NILP(regex_match_object)
6144 || BUFFERP(regex_match_object))
6145 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6146 <= BUF_PT(regex_emacs_buffer)))
6149 #if 0 /* not emacs19 */
6151 DEBUG_PRINT1("EXECUTING at_dot.\n");
6152 if (BUF_PTR_BYTE_POS
6153 (regex_emacs_buffer,
6154 (unsigned char *)d) + 1 !=
6155 BUF_PT(regex_emacs_buffer))
6158 #endif /* not emacs19 */
6161 DEBUG_PRINT2("EXECUTING syntaxspec %d.\n", mcnt);
6166 DEBUG_PRINT1("EXECUTING Emacs wordchar.\n");
6179 SYNTAX_CACHE_BYTE_TO_CHAR
6181 UPDATE_SYNTAX_CACHE(charpos);
6185 emch = charptr_emchar((const Bufbyte *)d);
6189 (regex_emacs_buffer->mirror_syntax_table),
6190 emch) == (enum syntaxcode)mcnt);
6192 if (matches != should_succeed)
6199 DEBUG_PRINT2("EXECUTING notsyntaxspec %d.\n", mcnt);
6201 goto matchnotsyntax;
6204 DEBUG_PRINT1("EXECUTING Emacs notwordchar.\n");
6208 goto matchornotsyntax;
6211 /* 97/2/17 jhod Mule category code patch */
6220 emch = charptr_emchar((const Bufbyte *)d);
6222 if (check_category_char
6223 (emch, regex_emacs_buffer->category_table,
6224 mcnt, should_succeed))
6230 case notcategoryspec:
6232 goto matchornotcategory;
6233 /* end of category patch */
6235 #else /* not emacs */
6237 DEBUG_PRINT1("EXECUTING non-Emacs wordchar.\n");
6239 if (!WORDCHAR_P_UNSAFE((int)(*d)))
6246 DEBUG_PRINT1("EXECUTING non-Emacs notwordchar.\n");
6248 if (!WORDCHAR_P_UNSAFE((int)(*d)))
6258 /* Successfully executed one pattern command; keep going. */
6261 /* We goto here if a matching operation fails. */
6263 if (!FAIL_STACK_EMPTY()) {
6264 /* A restart point is known. Restore to that state. */
6265 DEBUG_PRINT1("\nFAIL:\n");
6266 POP_FAILURE_POINT(d, p /* used to be const char* */,
6267 lowest_active_reg, highest_active_reg,
6268 regstart, regend, reg_info);
6270 /* If this failure point is a dummy, try the next one. */
6274 /* If we failed to the end of the pattern, don't examine
6278 re_bool is_a_jump_n = false;
6280 /* If failed to a backwards jump that's part of
6281 a repetition loop, need to pop this failure
6282 point and use the next one. */
6283 switch ((unsigned int)(re_opcode_t)*p) {
6286 case maybe_pop_jump:
6287 case pop_failure_jump:
6290 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6294 && (re_opcode_t) * p1 == succeed_n)
6296 && (re_opcode_t) * p1 ==
6305 if (d >= string1 && d <= end1)
6308 break; /* Matching at this starting point really fails. */
6312 goto restore_best_regs;
6316 return -1; /* Failure to match. */
6319 /* Subroutine definitions for re_match_2. */
6321 /* We are passed P pointing to a register number after a start_memory.
6323 Return true if the pattern up to the corresponding stop_memory can
6324 match the empty string, and false otherwise.
6326 If we find the matching stop_memory, sets P to point to one past its number.
6327 Otherwise, sets P to an undefined byte less than or equal to END.
6329 We don't handle duplicates properly (yet). */
6332 group_match_null_string_p(unsigned char **p, unsigned char *end,
6333 register_info_type * register_info)
6336 /* Point to after the args to the start_memory. */
6337 unsigned char *p1 = *p + 2;
6340 /* Skip over opcodes that can match nothing, and return true or
6341 false, as appropriate, when we get to one that can't, or to the
6342 matching stop_memory. */
6344 switch ((unsigned int)(re_opcode_t)*p1) {
6345 /* Could be either a loop or a series of alternatives. */
6346 case on_failure_jump:
6348 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6350 /* If the next operation is not a jump backwards in the
6354 /* Go through the on_failure_jumps of the alternatives,
6355 seeing if any of the alternatives cannot match nothing.
6356 The last alternative starts with only a jump,
6357 whereas the rest start with on_failure_jump and end
6358 with a jump, e.g., here is the pattern for `a|b|c':
6360 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
6361 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
6364 So, we have to first go through the first (n-1)
6365 alternatives and then deal with the last one separately. */
6367 /* Deal with the first (n-1) alternatives, which start
6368 with an on_failure_jump (see above) that jumps to right
6369 past a jump_past_alt. */
6371 while ((re_opcode_t) p1[mcnt - 3] ==
6373 /* `mcnt' holds how many bytes long the alternative
6374 is, including the ending `jump_past_alt' and
6377 if (!alt_match_null_string_p
6378 (p1, p1 + mcnt - 3, register_info))
6381 /* Move to right after this alternative, including the
6385 /* Break if it's the beginning of an n-th alternative
6386 that doesn't begin with an on_failure_jump. */
6387 if ((re_opcode_t) * p1 !=
6391 /* Still have to check that it's not an n-th
6392 alternative that starts with an on_failure_jump. */
6394 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6395 if ((re_opcode_t) p1[mcnt - 3] !=
6397 /* Get to the beginning of the n-th alternative. */
6403 /* Deal with the last alternative: go back and get number
6404 of the `jump_past_alt' just before it. `mcnt' contains
6405 the length of the alternative. */
6406 EXTRACT_NUMBER(mcnt, p1 - 2);
6408 if (!alt_match_null_string_p
6409 (p1, p1 + mcnt, register_info))
6412 p1 += mcnt; /* Get past the n-th alternative. */
6417 assert(p1[1] == **p);
6422 if (!common_op_match_null_string_p(&p1, end, register_info))
6425 } /* while p1 < end */
6428 } /* group_match_null_string_p */
6430 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6431 It expects P to be the first byte of a single alternative and END one
6432 byte past the last. The alternative can contain groups. */
6435 alt_match_null_string_p(unsigned char *p, unsigned char *end,
6436 register_info_type * register_info)
6439 unsigned char *p1 = p;
6442 /* Skip over opcodes that can match nothing, and break when we get
6443 to one that can't. */
6445 switch ((unsigned int)(re_opcode_t)*p1) {
6447 case on_failure_jump:
6449 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6454 if (!common_op_match_null_string_p(&p1, end, register_info))
6457 } /* while p1 < end */
6460 } /* alt_match_null_string_p */
6462 /* Deals with the ops common to group_match_null_string_p and
6463 alt_match_null_string_p.
6465 Sets P to one after the op and its arguments, if any. */
6468 common_op_match_null_string_p(unsigned char **p, unsigned char *end,
6469 register_info_type * register_info)
6474 unsigned char *p1 = *p;
6476 switch ((unsigned int)(re_opcode_t)*p1++) {
6495 assert(reg_no > 0 && reg_no <= MAX_REGNUM);
6496 ret = group_match_null_string_p(&p1, end, register_info);
6498 /* Have to set this here in case we're checking a group which
6499 contains a group and a back reference to it. */
6501 if (REG_MATCH_NULL_STRING_P(register_info[reg_no]) ==
6502 MATCH_NULL_UNSET_VALUE)
6503 REG_MATCH_NULL_STRING_P(register_info[reg_no]) = ret;
6509 /* If this is an optimized succeed_n for zero times, make the jump. */
6511 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6519 /* Get to the number of times to succeed. */
6521 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6525 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6532 if (!REG_MATCH_NULL_STRING_P(register_info[*p1]))
6540 /* All other opcodes mean we cannot match the empty string. */
6546 } /* common_op_match_null_string_p */
6548 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6549 bytes; nonzero otherwise. */
6552 bcmp_translate(re_char *s1, re_char *s2,
6553 REGISTER int len, RE_TRANSLATE_TYPE translate)
6555 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6557 const unsigned char *p1_end = s1 + len;
6558 const unsigned char *p2_end = s2 + len;
6560 while (p1 != p1_end && p2 != p2_end) {
6561 Emchar p1_ch, p2_ch;
6563 p1_ch = charptr_emchar(p1);
6564 p2_ch = charptr_emchar(p2);
6566 if (RE_TRANSLATE(p1_ch)
6567 != RE_TRANSLATE(p2_ch))
6572 #else /* not MULE */
6574 if (RE_TRANSLATE(*p1++) != RE_TRANSLATE(*p2++))
6582 /* Entry points for GNU code. */
6584 /* re_compile_pattern is the GNU regular expression compiler: it
6585 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6586 Returns 0 if the pattern was valid, otherwise an error string.
6588 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6589 are set in BUFP on entry.
6591 We call regex_compile to do the actual compilation. */
6593 const char *re_compile_pattern(const char *pattern, int length,
6594 struct re_pattern_buffer *bufp)
6598 /* GNU code is written to assume at least RE_NREGS registers will be set
6599 (and at least one extra will be -1). */
6600 bufp->regs_allocated = REGS_UNALLOCATED;
6602 /* And GNU code determines whether or not to get register information
6603 by passing null for the REGS argument to re_match, etc., not by
6607 /* Match anchors at newline. */
6608 bufp->newline_anchor = 1;
6610 ret = regex_compile((const unsigned char*)pattern,
6611 length, re_syntax_options, bufp);
6615 return gettext(re_error_msgid[(int)ret]);
6618 /* Entry points compatible with 4.2 BSD regex library. We don't define
6619 them unless specifically requested. */
6621 #ifdef _REGEX_RE_COMP
6623 /* BSD has one and only one pattern buffer. */
6624 static struct re_pattern_buffer re_comp_buf;
6626 char *re_comp(const char *s)
6631 if (!re_comp_buf.buffer)
6632 return gettext("No previous regular expression");
6636 if (!re_comp_buf.buffer) {
6637 re_comp_buf.buffer = (unsigned char *)xmalloc_atomic(200);
6638 if (re_comp_buf.buffer == NULL)
6639 return gettext(re_error_msgid[(int)REG_ESPACE]);
6640 re_comp_buf.allocated = 200;
6642 re_comp_buf.fastmap = (char *)xmalloc_atomic(1 << BYTEWIDTH);
6643 if (re_comp_buf.fastmap == NULL)
6644 return gettext(re_error_msgid[(int)REG_ESPACE]);
6647 /* Since `re_exec' always passes NULL for the `regs' argument, we
6648 don't need to initialize the pattern buffer fields which affect it. */
6650 /* Match anchors at newlines. */
6651 re_comp_buf.newline_anchor = 1;
6654 regex_compile((unsigned char *)s, strlen(s), re_syntax_options,
6660 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6661 return (char *)gettext(re_error_msgid[(int)ret]);
6664 int re_exec(const char *s)
6666 const int len = strlen(s);
6668 0 <= re_search(&re_comp_buf, s, len, 0, len,
6669 (struct re_registers *)0);
6671 #endif /* _REGEX_RE_COMP */
6673 /* POSIX.2 functions. Don't define these for Emacs. */
6677 /* regcomp takes a regular expression as a string and compiles it.
6679 PREG is a regex_t *. We do not expect any fields to be initialized,
6680 since POSIX says we shouldn't. Thus, we set
6682 `buffer' to the compiled pattern;
6683 `used' to the length of the compiled pattern;
6684 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6685 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6686 RE_SYNTAX_POSIX_BASIC;
6687 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6688 `fastmap' and `fastmap_accurate' to zero;
6689 `re_nsub' to the number of subexpressions in PATTERN.
6690 (non-shy of course. POSIX probably doesn't know about
6691 shy ones, and in any case they should be invisible.)
6693 PATTERN is the address of the pattern string.
6695 CFLAGS is a series of bits which affect compilation.
6697 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6698 use POSIX basic syntax.
6700 If REG_NEWLINE is set, then . and [^...] don't match newline.
6701 Also, regexec will try a match beginning after every newline.
6703 If REG_ICASE is set, then we considers upper- and lowercase
6704 versions of letters to be equivalent when matching.
6706 If REG_NOSUB is set, then when PREG is passed to regexec, that
6707 routine will report only success or failure, and nothing about the
6710 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6711 the return codes and their meanings.) */
6713 int regcomp(regex_t * preg, const char *pattern, int cflags)
6717 = (cflags & REG_EXTENDED) ?
6718 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6720 /* regex_compile will allocate the space for the compiled pattern. */
6722 preg->allocated = 0;
6725 /* Don't bother to use a fastmap when searching. This simplifies the
6726 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6727 characters after newlines into the fastmap. This way, we just try
6731 if (cflags & REG_ICASE) {
6734 preg->translate = (char *)xmalloc_atomic(CHAR_SET_SIZE);
6735 if (preg->translate == NULL)
6736 return (int)REG_ESPACE;
6738 /* Map uppercase characters to corresponding lowercase ones. */
6739 for (i = 0; i < CHAR_SET_SIZE; i++)
6740 preg->translate[i] = ISUPPER(i) ? tolower(i) : i;
6742 preg->translate = NULL;
6744 /* If REG_NEWLINE is set, newlines are treated differently. */
6745 if (cflags & REG_NEWLINE) {
6746 /* REG_NEWLINE implies neither . nor [^...] match newline. */
6747 syntax &= ~RE_DOT_NEWLINE;
6748 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6749 /* It also changes the matching behavior. */
6750 preg->newline_anchor = 1;
6752 preg->newline_anchor = 0;
6754 preg->no_sub = !!(cflags & REG_NOSUB);
6756 /* POSIX says a null character in the pattern terminates it, so we
6757 can use strlen here in compiling the pattern. */
6758 ret = regex_compile((const unsigned char*)pattern,
6759 strlen(pattern), syntax, preg);
6761 /* POSIX doesn't distinguish between an unmatched open-group and an
6762 unmatched close-group: both are REG_EPAREN. */
6763 if (ret == REG_ERPAREN)
6769 /* regexec searches for a given pattern, specified by PREG, in the
6772 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6773 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6774 least NMATCH elements, and we set them to the offsets of the
6775 corresponding matched substrings.
6777 EFLAGS specifies `execution flags' which affect matching: if
6778 REG_NOTBOL is set, then ^ does not match at the beginning of the
6779 string; if REG_NOTEOL is set, then $ does not match at the end.
6781 We return 0 if we find a match and REG_NOMATCH if not. */
6784 regexec(const regex_t * preg, const char *string, Element_count nmatch,
6785 regmatch_t pmatch[], int eflags)
6788 struct re_registers regs;
6789 regex_t private_preg;
6790 int len = strlen(string);
6791 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6793 private_preg = *preg;
6795 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6796 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6798 /* The user has told us exactly how many registers to return
6799 information about, via `nmatch'. We have to pass that on to the
6800 matching routines. */
6801 private_preg.regs_allocated = REGS_FIXED;
6803 if (want_reg_info) {
6804 regs.num_regs = nmatch;
6805 regs.start = TALLOC(nmatch, regoff_t);
6806 regs.end = TALLOC(nmatch, regoff_t);
6807 if (regs.start == NULL || regs.end == NULL)
6808 return (int)REG_NOMATCH;
6811 /* Perform the searching operation. */
6812 ret = re_search(&private_preg, string, len,
6813 /* start: */ 0, /* range: */ len,
6814 want_reg_info ? ®s : (struct re_registers *)0);
6816 /* Copy the register information to the POSIX structure. */
6817 if (want_reg_info) {
6821 for (r = 0; r < nmatch; r++) {
6822 pmatch[r].rm_so = regs.start[r];
6823 pmatch[r].rm_eo = regs.end[r];
6827 /* If we needed the temporary register info, free the space now. */
6832 /* We want zero return to mean success, unlike `re_search'. */
6833 return ret >= 0 ? (int)REG_NOERROR : (int)REG_NOMATCH;
6836 /* Returns a message corresponding to an error code, ERRCODE, returned
6837 from either regcomp or regexec. We don't use PREG here. */
6840 regerror(int errcode, const regex_t * preg, char *errbuf,
6841 Memory_count errbuf_size)
6844 Memory_count msg_size;
6846 if (errcode < 0 || (size_t) errcode >= (sizeof(re_error_msgid)
6847 / sizeof(re_error_msgid[0])))
6848 /* Only error codes returned by the rest of the code should be passed
6849 to this routine. If we are given anything else, or if other regex
6850 code generates an invalid error code, then the program has a bug.
6851 Dump core so we can fix it. */
6854 msg = gettext(re_error_msgid[errcode]);
6856 msg_size = strlen(msg) + 1; /* Includes the null. */
6858 if (errbuf_size != 0) {
6859 strncpy(errbuf, msg, errbuf_size - 1);
6860 errbuf[errbuf_size-1]='\0';
6866 /* Free dynamically allocated space used by PREG. */
6868 void regfree(regex_t * preg)
6870 if (preg->buffer != NULL) {
6871 xfree(preg->buffer);
6873 preg->buffer = NULL;
6875 preg->allocated = 0;
6878 if (preg->fastmap != NULL) {
6879 xfree(preg->fastmap);
6881 preg->fastmap = NULL;
6882 preg->fastmap_accurate = 0;
6884 if (preg->translate != NULL) {
6885 xfree(preg->translate);
6887 preg->translate = NULL;
6890 #endif /* not emacs */
6894 make-backup-files: t
6896 trim-versions-without-asking: nil