1 /* Extended regular expression matching and search library,
2 version 0.12, extended for XEmacs.
3 (Implements POSIX draft P10003.2/D11.2, except for
4 internationalization features.)
6 Copyright (C) 1993, 1994, 1995 Free Software Foundation, Inc.
7 Copyright (C) 1995 Sun Microsystems, Inc.
8 Copyright (C) 1995 Ben Wing.
10 This file is part of SXEmacs
12 SXEmacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
17 SXEmacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
22 You should have received a copy of the GNU General Public License
23 along with this program. If not, see <http://www.gnu.org/licenses/>. */
26 /* Synched up with: FSF 19.29. */
28 /* Changes made for XEmacs:
30 (1) the REGEX_BEGLINE_CHECK code from the XEmacs v18 regex routines
31 was added. This causes a huge speedup in font-locking.
32 (2) Rel-alloc is disabled when the MMAP version of rel-alloc is
33 being used, because it's too slow -- all those calls to mmap()
34 add humongous overhead.
35 (3) Lots and lots of changes for Mule. They are bracketed by
36 `#ifdef MULE' or with comments that have `XEmacs' in them.
43 #ifndef REGISTER /* Rigidly enforced as of 20.3 */
52 /* Converts the pointer to the char to BEG-based offset from the start. */
53 #define PTR_TO_OFFSET(d) (MATCHING_IN_FIRST_STRING \
54 ? (d) - string1 : (d) - (string2 - size1))
56 #define PTR_TO_OFFSET(d) 0
59 /* We assume non-Mule if emacs isn't defined. */
64 /* We need this for `regex.h', and perhaps for the Emacs include files. */
65 #include <sys/types.h>
67 /* This is for other GNU distributions with internationalized messages. */
68 #if defined (I18N3) && (defined (HAVE_LIBINTL_H) || defined (_LIBC))
71 # define gettext(msgid) (msgid)
74 /* XEmacs: define this to add in a speedup for patterns anchored at
75 the beginning of a line. Keep the ifdefs so that it's easier to
76 tell where/why this code has diverged from v19. */
77 #define REGEX_BEGLINE_CHECK
79 /* XEmacs: the current mmap-based ralloc handles small blocks very
80 poorly, so we disable it here. */
82 #if (defined (REL_ALLOC) && defined (HAVE_MMAP)) || defined(DOUG_LEA_MALLOC)
86 /* The `emacs' switch turns on certain matching commands
87 that make sense only in Emacs. */
96 Lisp_Object Vthe_lisp_rangetab;
98 void complex_vars_of_regex(void)
100 Vthe_lisp_rangetab = Fmake_range_table();
101 staticpro(&Vthe_lisp_rangetab);
106 void complex_vars_of_regex(void)
112 #define RE_TRANSLATE(ch) TRT_TABLE_OF (translate, (Emchar) ch)
113 #define TRANSLATE_P(tr) (!NILP (tr) && CHAR_TABLEP(tr))
115 /* just massage that xfree thing */
117 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
125 /* If we are not linking with Emacs proper,
126 we can't use the relocating allocator
127 even if config.h says that we can. */
130 #if defined (STDC_HEADERS) || defined (_LIBC)
132 # include <stdbool.h>
138 /* Types normally included via lisp.h */
139 #include <stddef.h> /* for ptrdiff_t */
142 #ifndef DECLARE_NOTHING
143 #define DECLARE_NOTHING struct nosuchstruct
149 #define charptr_emchar(str) ((Emchar) (str)[0])
151 #define INC_CHARPTR(p) ((p)++)
152 #define DEC_CHARPTR(p) ((p)--)
156 /* Define the syntax stuff for \<, \>, etc. */
158 /* This must be nonzero for the wordchar and notwordchar pattern
159 commands in re_match_2. */
166 extern char *re_syntax_table;
168 #else /* not SYNTAX_TABLE */
170 /* How many characters in the character set. */
171 #define CHAR_SET_SIZE 256
173 static char re_syntax_table[CHAR_SET_SIZE];
175 static void init_syntax_once(void)
180 const char *word_syntax_chars =
181 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_";
183 memset(re_syntax_table, 0, sizeof(re_syntax_table));
185 while (*word_syntax_chars)
186 re_syntax_table[(unsigned int)(*word_syntax_chars++)] =
193 #endif /* SYNTAX_TABLE */
195 #define SYNTAX_UNSAFE(ignored, c) re_syntax_table[c]
196 #undef SYNTAX_FROM_CACHE
197 #define SYNTAX_FROM_CACHE SYNTAX_UNSAFE
199 #define RE_TRANSLATE(c) translate[(unsigned char) (c)]
200 #define TRANSLATE_P(tr) tr
202 #if defined HAVE_BDWGC && defined EF_USE_BDWGC
203 # if defined HAVE_GC_GC_H
205 # elif defined HAVE_GC_H
208 # define xmalloc GC_MALLOC
209 # define xmalloc_atomic GC_MALLOC_ATOMIC
210 # define xrealloc GC_REALLOC
211 # define xfree GC_FREE
212 # define xstrdup GC_STRDUP
213 #else /* !HAVE_BDWGC */
214 # define xmalloc malloc
215 # define xmalloc_atomic malloc
216 # define xrealloc realloc
218 # define xstrdup strdup
219 #endif /* HAVE_BDWGC */
224 /* Under XEmacs, this is needed because we don't define it elsewhere. */
225 #ifdef SWITCH_ENUM_BUG
226 #define SWITCH_ENUM_CAST(x) ((int)(x))
228 #define SWITCH_ENUM_CAST(x) (x)
231 /* Get the interface, including the syntax bits. */
234 /* isalpha etc. are used for the character classes. */
237 /* Jim Meyering writes:
239 "... Some ctype macros are valid only for character codes that
240 isascii says are ASCII (SGI's IRIX-4.0.5 is one such system --when
241 using /bin/cc or gcc but without giving an ansi option). So, all
242 ctype uses should be through macros like ISPRINT... If
243 STDC_HEADERS is defined, then autoconf has verified that the ctype
244 macros don't need to be guarded with references to isascii. ...
245 Defining isascii to 1 should let any compiler worth its salt
246 eliminate the && through constant folding." */
248 #if defined (STDC_HEADERS) || (!defined (isascii) && !defined (HAVE_ISASCII))
249 #define ISASCII_1(c) 1
251 #define ISASCII_1(c) isascii(c)
255 /* The IS*() macros can be passed any character, including an extended
256 one. We need to make sure there are no crashes, which would occur
257 otherwise due to out-of-bounds array references. */
258 #define ISASCII(c) (((EMACS_UINT) (c)) < 0x100 && ISASCII_1 (c))
260 #define ISASCII(c) ISASCII_1 (c)
264 #define ISBLANK(c) (ISASCII (c) && isblank (c))
266 #define ISBLANK(c) ((c) == ' ' || (c) == '\t')
269 #define ISGRAPH(c) (ISASCII (c) && isgraph (c))
271 #define ISGRAPH(c) (ISASCII (c) && isprint (c) && !isspace (c))
274 /* stolen from FSF/src/charset.h */
275 #define SINGLE_BYTE_CHAR_P(c) (((unsigned int)(c) & 0xFF) == (c))
276 /* 1 if C is an ASCII character. */
277 #define IS_REAL_ASCII(c) ((c) < 128)
278 /* 1 if C is a unibyte character. */
279 #define ISUNIBYTE(c) (SINGLE_BYTE_CHAR_P(c))
281 #define ISPRINT(c) (ISASCII (c) && isprint (c))
282 #define ISDIGIT(c) (ISASCII (c) && isdigit (c))
283 #define ISALNUM(c) (ISASCII (c) && isalnum (c))
284 #define ISALPHA(c) (ISASCII (c) && isalpha (c))
285 #define ISCNTRL(c) (ISASCII (c) && iscntrl (c))
286 #define ISLOWER(c) (ISASCII (c) && islower (c))
287 #define ISPUNCT(c) (ISASCII (c) && ispunct (c))
288 #define ISSPACE(c) (ISASCII (c) && isspace (c))
289 #define ISUPPER(c) (ISASCII (c) && isupper (c))
290 #define ISXDIGIT(c) (ISASCII (c) && isxdigit (c))
295 __attribute__((always_inline));
299 Lisp_Object ct = regex_emacs_buffer->mirror_syntax_table;
301 return (SYNTAX(XCHAR_TABLE(ct), c) == Sword);
304 # define ISWORD(c) (ISALPHA(c))
308 #define NULL (void *)0
311 /* We remove any previous definition of `SIGN_EXTEND_CHAR',
312 since ours (we hope) works properly with all combinations of
313 machines, compilers, `char' and `unsigned char' argument types.
314 (Per Bothner suggested the basic approach.) */
315 #undef SIGN_EXTEND_CHAR
317 #define SIGN_EXTEND_CHAR(c) ((signed char) (c))
318 #else /* not __STDC__ */
319 /* As in Harbison and Steele. */
320 #define SIGN_EXTEND_CHAR(c) ((((unsigned char) (c)) ^ 128) - 128)
323 /* Should we use malloc or alloca? If REGEX_MALLOC is not defined, we
324 use `alloca' instead of `malloc'. This is because using malloc in
325 re_search* or re_match* could cause memory leaks when C-g is used in
326 Emacs; also, malloc is slower and causes storage fragmentation. On
327 the other hand, malloc is more portable, and easier to debug.
329 Because we sometimes use alloca, some routines have to be macros,
330 not functions -- `alloca'-allocated space disappears at the end of the
331 function it is called in. */
335 #define REGEX_ALLOCATE xmalloc
336 #define REGEX_REALLOCATE(source, osize, nsize) xrealloc(source, nsize)
337 #define REGEX_FREE xfree
339 #else /* !REGEX_MALLOC */
341 /* Emacs already defines alloca, sometimes. */
344 /* Make alloca work the best possible way. */
346 #define alloca __builtin_alloca
347 #else /* not __GNUC__ */
350 #else /* not __GNUC__ or HAVE_ALLOCA_H */
351 #ifndef _AIX /* Already did AIX, up at the top. */
353 #endif /* not _AIX */
354 #endif /* HAVE_ALLOCA_H */
355 #endif /* __GNUC__ */
357 #endif /* not alloca */
359 #define REGEX_ALLOCATE alloca
361 /* Assumes a `char *destination' variable. */
362 #define REGEX_REALLOCATE(source, osize, nsize) \
363 (destination = (char *) alloca (nsize), \
364 memmove (destination, source, osize), \
367 /* No need to do anything to free, after alloca. */
368 #define REGEX_FREE(arg) ((void)0) /* Do nothing! But inhibit gcc warning. */
370 #endif /* REGEX_MALLOC */
372 /* Define how to allocate the failure stack. */
376 #define REGEX_ALLOCATE_STACK(size) \
377 r_alloc ((char **) &failure_stack_ptr, (size))
378 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
379 r_re_alloc ((char **) &failure_stack_ptr, (nsize))
380 #define REGEX_FREE_STACK(ptr) \
381 r_alloc_free ((void **) &failure_stack_ptr)
383 #else /* !REL_ALLOC */
387 #define REGEX_ALLOCATE_STACK xmalloc
388 #define REGEX_REALLOCATE_STACK(source, osize, nsize) xrealloc(source, nsize)
389 #define REGEX_FREE_STACK xfree
391 #else /* !REGEX_MALLOC */
393 #define REGEX_ALLOCATE_STACK alloca
395 #define REGEX_REALLOCATE_STACK(source, osize, nsize) \
396 REGEX_REALLOCATE (source, osize, nsize)
397 /* No need to explicitly free anything. */
398 #define REGEX_FREE_STACK(arg)
400 #endif /* REGEX_MALLOC */
401 #endif /* REL_ALLOC */
403 /* True if `size1' is non-NULL and PTR is pointing anywhere inside
404 `string1' or just past its end. This works if PTR is NULL, which is
406 #define FIRST_STRING_P(ptr) \
407 (size1 && string1 <= (ptr) && (ptr) <= string1 + size1)
409 /* (Re)Allocate N items of type T using malloc, or fail. */
410 #define TALLOC(n, t) \
411 ((t *)xmalloc((n) * sizeof (t)))
412 #define TALLOC_ATOMIC(n, t) \
413 ((t *)xmalloc_atomic((n) * sizeof (t)))
414 #define RETALLOC(addr, n, t) \
415 ((addr) = (t *)xrealloc(addr, (n) * sizeof (t)))
416 #define RETALLOC_IF(addr, n, t) \
418 RETALLOC((addr), (n), t); \
420 (addr) = TALLOC ((n), t)
421 #define REGEX_TALLOC(n, t) \
422 ((t *)REGEX_ALLOCATE((n) * sizeof (t)))
424 #define BYTEWIDTH 8 /* In bits. */
426 #define STREQ(s1, s2) (strcmp (s1, s2) == 0)
430 #define MAX(a, b) ((a) > (b) ? (a) : (b))
431 #define MIN(a, b) ((a) < (b) ? (a) : (b))
433 /* Type of source-pattern and string chars. */
434 typedef const unsigned char re_char;
436 typedef char re_bool;
440 /* These are the command codes that appear in compiled regular
441 expressions. Some opcodes are followed by argument bytes. A
442 command code can specify any interpretation whatsoever for its
443 arguments. Zero bytes may appear in the compiled regular expression. */
448 /* Succeed right away--no more backtracking. */
451 /* Followed by one byte giving n, then by n literal bytes. */
454 /* Matches any (more or less) character. */
457 /* Matches any one char belonging to specified set. First
458 following byte is number of bitmap bytes. Then come bytes
459 for a bitmap saying which chars are in. Bits in each byte
460 are ordered low-bit-first. A character is in the set if its
461 bit is 1. A character too large to have a bit in the map is
462 automatically not in the set. */
465 /* Same parameters as charset, but match any character that is
466 not one of those specified. */
469 /* Start remembering the text that is matched, for storing in a
470 register. Followed by one byte with the register number, in
471 the range 1 to the pattern buffer's re_ngroups
472 field. Then followed by one byte with the number of groups
473 inner to this one. (This last has to be part of the
474 start_memory only because we need it in the on_failure_jump
478 /* Stop remembering the text that is matched and store it in a
479 memory register. Followed by one byte with the register
480 number, in the range 1 to `re_ngroups' in the
481 pattern buffer, and one byte with the number of inner groups,
482 just like `start_memory'. (We need the number of inner
483 groups here because we don't have any easy way of finding the
484 corresponding start_memory when we're at a stop_memory.) */
487 /* Match a duplicate of something remembered. Followed by one
488 byte containing the register number. */
491 /* Fail unless at beginning of line. */
494 /* Fail unless at end of line. */
497 /* Succeeds if at beginning of buffer (if emacs) or at beginning
498 of string to be matched (if not). */
501 /* Analogously, for end of buffer/string. */
504 /* Followed by two byte relative address to which to jump. */
507 /* Same as jump, but marks the end of an alternative. */
510 /* Followed by two-byte relative address of place to resume at
511 in case of failure. */
514 /* Like on_failure_jump, but pushes a placeholder instead of the
515 current string position when executed. */
516 on_failure_keep_string_jump,
518 /* Throw away latest failure point and then jump to following
519 two-byte relative address. */
522 /* Change to pop_failure_jump if know won't have to backtrack to
523 match; otherwise change to jump. This is used to jump
524 back to the beginning of a repeat. If what follows this jump
525 clearly won't match what the repeat does, such that we can be
526 sure that there is no use backtracking out of repetitions
527 already matched, then we change it to a pop_failure_jump.
528 Followed by two-byte address. */
531 /* Jump to following two-byte address, and push a dummy failure
532 point. This failure point will be thrown away if an attempt
533 is made to use it for a failure. A `+' construct makes this
534 before the first repeat. Also used as an intermediary kind
535 of jump when compiling an alternative. */
538 /* Push a dummy failure point and continue. Used at the end of
542 /* Followed by two-byte relative address and two-byte number n.
543 After matching N times, jump to the address upon failure. */
546 /* Followed by two-byte relative address, and two-byte number n.
547 Jump to the address N times, then fail. */
550 /* Set the following two-byte relative address to the
551 subsequent two-byte number. The address *includes* the two
555 wordchar, /* Matches any word-constituent character. */
556 notwordchar, /* Matches any char that is not a word-constituent. */
558 wordbeg, /* Succeeds if at word beginning. */
559 wordend, /* Succeeds if at word end. */
561 wordbound, /* Succeeds if at a word boundary. */
562 notwordbound /* Succeeds if not at a word boundary. */
564 , before_dot, /* Succeeds if before point. */
565 at_dot, /* Succeeds if at point. */
566 after_dot, /* Succeeds if after point. */
568 /* Matches any character whose syntax is specified. Followed by
569 a byte which contains a syntax code, e.g., Sword. */
572 /* Matches any character whose syntax is not that specified. */
576 /* need extra stuff to be able to properly work with XEmacs/Mule
577 characters (which may take up more than one byte) */
578 , charset_mule, /* Matches any character belonging to specified set.
579 The set is stored in "unified range-table
580 format"; see rangetab.c. Unlike the `charset'
581 opcode, this can handle arbitrary characters. */
583 charset_mule_not /* Same parameters as charset_mule, but match any
584 character that is not one of those specified. */
585 /* 97/2/17 jhod: The following two were merged back in from the Mule
586 2.3 code to enable some language specific processing */
587 , categoryspec, /* Matches entries in the character category tables */
588 notcategoryspec /* The opposite of the above */
592 /* Common operations on the compiled pattern. */
594 /* Store NUMBER in two contiguous bytes starting at DESTINATION. */
596 #define STORE_NUMBER(destination, number) \
598 (destination)[0] = (number) & 0377; \
599 (destination)[1] = (number) >> 8; \
602 /* Same as STORE_NUMBER, except increment DESTINATION to
603 the byte after where the number is stored. Therefore, DESTINATION
604 must be an lvalue. */
606 #define STORE_NUMBER_AND_INCR(destination, number) \
608 STORE_NUMBER (destination, number); \
609 (destination) += 2; \
612 /* Put into DESTINATION a number stored in two contiguous bytes starting
615 #define EXTRACT_NUMBER(destination, source) \
617 (destination) = *(source) & 0377; \
618 (destination) += SIGN_EXTEND_CHAR (*((source) + 1)) << 8; \
621 #ifdef REGEX_DEBUG_FLAG
622 static void extract_number(int *dest, re_char * source)
624 int temp = SIGN_EXTEND_CHAR(*(source + 1));
625 *dest = *source & 0377;
629 #ifndef EXTRACT_MACROS /* To debug the macros. */
630 #undef EXTRACT_NUMBER
631 #define EXTRACT_NUMBER(dest, src) extract_number (&dest, src)
632 #endif /* not EXTRACT_MACROS */
634 #endif /* REGEX_DEBUG_FLAG */
636 /* Same as EXTRACT_NUMBER, except increment SOURCE to after the number.
637 SOURCE must be an lvalue. */
639 #define EXTRACT_NUMBER_AND_INCR(destination, source) \
641 EXTRACT_NUMBER (destination, source); \
645 #ifdef REGEX_DEBUG_FLAG
647 extract_number_and_incr(int *destination, unsigned char **source)
649 extract_number(destination, *source);
653 #ifndef EXTRACT_MACROS
654 #undef EXTRACT_NUMBER_AND_INCR
655 #define EXTRACT_NUMBER_AND_INCR(dest, src) \
656 extract_number_and_incr (&dest, &src)
657 #endif /* not EXTRACT_MACROS */
659 #endif /* REGEX_DEBUG_FLAG */
661 /* If REGEX_DEBUG_FLAG is defined, Regex prints many voluminous messages about
662 what it is doing (if the variable `debug' is nonzero). If linked with the
663 main program in `iregex.c', you can enter patterns and strings interactively.
664 And if linked with the main program in `main.c' and the other test files, you
665 can run the already-written tests. */
667 #if defined (REGEX_DEBUG_FLAG)
669 /* We use standard I/O for debugging. */
673 /* XEmacs provides its own version of assert() */
674 /* It is useful to test things that ``must'' be true when debugging. */
679 /* no point in having this one, since we do not offer any accessor to this */
680 static int debug = 0;
683 #define DEBUG_STATEMENT(e) e
684 #define DEBUG_PRINT1(x) printf(x)
685 #define DEBUG_PRINT2(x1, x2) printf(x1, x2)
686 #define DEBUG_PRINT3(x1, x2, x3) printf(x1, x2, x3)
687 #define DEBUG_PRINT4(x1, x2, x3, x4) printf(x1, x2, x3, x4)
688 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e) \
689 print_partial_compiled_pattern(s, e)
690 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2) \
691 print_double_string (w, s1, sz1, s2, sz2)
693 /* Print the fastmap in human-readable form. */
695 static void print_fastmap(char *fastmap)
697 unsigned was_a_range = 0;
700 while (i < (1 << BYTEWIDTH)) {
704 while (i < (1 << BYTEWIDTH) && fastmap[i]) {
717 /* Print a compiled pattern string in human-readable form, starting at
718 the START pointer into it and ending just before the pointer END. */
720 /* can't help it ... the ugly mule code changes `start' upon alignment,
721 * so this can't have the const qualifier ... bugger */
723 print_partial_compiled_pattern(unsigned char *start, re_char *end)
726 unsigned char *p = start;
734 /* Loop over pattern commands. */
736 printf("%ld:\t", (long)(p - start));
738 switch ((const re_opcode_t)*p++) {
745 printf("/exactn/%d", mcnt);
755 printf("/start_memory/%d/%d", mcnt, *p++);
760 printf("/stop_memory/%d/%d", mcnt, *p++);
764 printf("/duplicate/%d", *p++);
773 REGISTER int c, last = -100;
774 REGISTER int in_range = 0;
776 printf("/charset [%s",
777 (re_opcode_t) * (p - 1) == charset_not
780 assert(p + *p < pend);
782 for (c = 0; c < 256; c++)
783 if (((unsigned char)(c / 8) < *p)
786 /* Are we starting a range? */
787 if (last + 1 == c && !in_range) {
791 /* Have we broken a range? */
792 else if (last + 1 != c
815 case charset_mule_not: {
818 printf("/charset_mule [%s",
819 (re_opcode_t) * (p - 1) == charset_mule_not
821 /* u_r_t_nentries() CHANGES its arg, why on earth
822 * is this marked const here then? :O -hrop */
823 nentries = unified_range_table_nentries(p);
824 for (i = 0; i < nentries; i++) {
825 EMACS_INT first, last;
826 Lisp_Object dummy_val;
828 /* u_r_t_get_range CHANGES its arg ...
829 * p can't be const thence */
830 unified_range_table_get_range(p, i,
837 printf("(0x%lx)", (long)first);
848 p += unified_range_table_bytes_used(p);
861 case on_failure_jump:
862 extract_number_and_incr(&mcnt, &p);
863 printf("/on_failure_jump to %ld",
864 (long)(p + mcnt - start));
867 case on_failure_keep_string_jump:
868 extract_number_and_incr(&mcnt, &p);
869 printf("/on_failure_keep_string_jump to %ld",
870 (long)(p + mcnt - start));
873 case dummy_failure_jump:
874 extract_number_and_incr(&mcnt, &p);
875 printf("/dummy_failure_jump to %ld",
876 (long)(p + mcnt - start));
879 case push_dummy_failure:
880 printf("/push_dummy_failure");
884 extract_number_and_incr(&mcnt, &p);
885 printf("/maybe_pop_jump to %ld",
886 (long)(p + mcnt - start));
889 case pop_failure_jump:
890 extract_number_and_incr(&mcnt, &p);
891 printf("/pop_failure_jump to %ld",
892 (long)(p + mcnt - start));
896 extract_number_and_incr(&mcnt, &p);
897 printf("/jump_past_alt to %ld",
898 (long)(p + mcnt - start));
902 extract_number_and_incr(&mcnt, &p);
903 printf("/jump to %ld", (long)(p + mcnt - start));
907 extract_number_and_incr(&mcnt, &p);
908 extract_number_and_incr(&mcnt2, &p);
909 printf("/succeed_n to %ld, %d times",
910 (long)(p + mcnt - start), mcnt2);
914 extract_number_and_incr(&mcnt, &p);
915 extract_number_and_incr(&mcnt2, &p);
916 printf("/jump_n to %ld, %d times",
917 (long)(p + mcnt - start), mcnt2);
921 extract_number_and_incr(&mcnt, &p);
922 extract_number_and_incr(&mcnt2, &p);
923 printf("/set_number_at location %ld to %d",
924 (long)(p + mcnt - start), mcnt2);
928 printf("/wordbound");
932 printf("/notwordbound");
944 printf("/before_dot");
952 printf("/after_dot");
956 printf("/syntaxspec");
962 printf("/notsyntaxspec");
968 /* 97/2/17 jhod Mule category patch */
970 printf("/categoryspec");
975 case notcategoryspec:
976 printf("/notcategoryspec");
980 /* end of category patch */
989 printf("/notwordchar");
1002 printf("?%d", *(p - 1));
1008 printf("%ld:\tend of pattern.\n", (long)(p - start));
1012 print_compiled_pattern(struct re_pattern_buffer *bufp)
1016 re_char *buffer = bufp->buffer;
1018 unsigned char *buffer = bufp->buffer;
1021 print_partial_compiled_pattern(buffer, buffer + bufp->used);
1022 printf("%ld bytes used/%ld bytes allocated.\n", bufp->used,
1025 if (bufp->fastmap_accurate && bufp->fastmap) {
1026 printf("fastmap: ");
1027 print_fastmap(bufp->fastmap);
1030 printf("re_nsub: %ld\t", (long)bufp->re_nsub);
1031 printf("re_ngroups: %ld\t", (long)bufp->re_ngroups);
1032 printf("regs_alloc: %d\t", bufp->regs_allocated);
1033 printf("can_be_null: %d\t", bufp->can_be_null);
1034 printf("newline_anchor: %d\n", bufp->newline_anchor);
1035 printf("no_sub: %d\t", bufp->no_sub);
1036 printf("not_bol: %d\t", bufp->not_bol);
1037 printf("not_eol: %d\t", bufp->not_eol);
1038 printf("syntax: %d\n", bufp->syntax);
1039 /* Perhaps we should print the translate table? */
1040 /* and maybe the category table? */
1042 if (bufp->external_to_internal_register) {
1045 printf("external_to_internal_register:\n");
1046 for (i = 0; i <= bufp->re_nsub; i++) {
1049 printf("%d -> %d", i,
1050 bufp->external_to_internal_register[i]);
1057 print_double_string(re_char * where, re_char * string1, int size1,
1058 re_char * string2, int size2)
1063 Element_count this_char;
1065 if (FIRST_STRING_P(where)) {
1066 for (this_char = where - string1; this_char < size1;
1068 putchar(string1[this_char]);
1073 for (this_char = where - string2; this_char < size2;
1075 putchar(string2[this_char]);
1079 #else /* not REGEX_DEBUG_FLAG */
1084 #define DEBUG_STATEMENT(e)
1085 #define DEBUG_PRINT1(x)
1086 #define DEBUG_PRINT2(x1, x2)
1087 #define DEBUG_PRINT3(x1, x2, x3)
1088 #define DEBUG_PRINT4(x1, x2, x3, x4)
1089 #define DEBUG_PRINT_COMPILED_PATTERN(p, s, e)
1090 #define DEBUG_PRINT_DOUBLE_STRING(w, s1, sz1, s2, sz2)
1092 #endif /* REGEX_DEBUG_FLAG */
1094 /* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
1095 also be assigned to arbitrarily: each pattern buffer stores its own
1096 syntax, so it can be changed between regex compilations. */
1097 /* This has no initializer because initialized variables in Emacs
1098 become read-only after dumping. */
1099 reg_syntax_t re_syntax_options;
1101 /* Specify the precise syntax of regexps for compilation. This provides
1102 for compatibility for various utilities which historically have
1103 different, incompatible syntaxes.
1105 The argument SYNTAX is a bit mask comprised of the various bits
1106 defined in regex.h. We return the old syntax. */
1108 reg_syntax_t re_set_syntax(reg_syntax_t syntax)
1110 reg_syntax_t ret = re_syntax_options;
1112 re_syntax_options = syntax;
1116 /* This table gives an error message for each of the error codes listed
1117 in regex.h. Obviously the order here has to be same as there.
1118 POSIX doesn't require that we do anything for REG_NOERROR,
1119 but why not be nice? */
1121 static const char *re_error_msgid[] = {
1122 "Success", /* REG_NOERROR */
1123 "No match", /* REG_NOMATCH */
1124 "Invalid regular expression", /* REG_BADPAT */
1125 "Invalid collation character", /* REG_ECOLLATE */
1126 "Invalid character class name", /* REG_ECTYPE */
1127 "Trailing backslash", /* REG_EESCAPE */
1128 "Invalid back reference", /* REG_ESUBREG */
1129 "Unmatched [ or [^", /* REG_EBRACK */
1130 "Unmatched ( or \\(", /* REG_EPAREN */
1131 "Unmatched \\{", /* REG_EBRACE */
1132 "Invalid content of \\{\\}", /* REG_BADBR */
1133 "Invalid range end", /* REG_ERANGE */
1134 "Memory exhausted", /* REG_ESPACE */
1135 "Invalid preceding regular expression", /* REG_BADRPT */
1136 "Premature end of regular expression", /* REG_EEND */
1137 "Regular expression too big", /* REG_ESIZE */
1138 "Unmatched ) or \\)", /* REG_ERPAREN */
1140 "Invalid syntax designator", /* REG_ESYNTAX */
1143 "Ranges may not span charsets", /* REG_ERANGESPAN */
1144 "Invalid category designator", /* REG_ECATEGORY */
1148 /* Avoiding alloca during matching, to placate r_alloc. */
1150 /* Define MATCH_MAY_ALLOCATE unless we need to make sure that the
1151 searching and matching functions should not call alloca. On some
1152 systems, alloca is implemented in terms of malloc, and if we're
1153 using the relocating allocator routines, then malloc could cause a
1154 relocation, which might (if the strings being searched are in the
1155 ralloc heap) shift the data out from underneath the regexp
1158 Here's another reason to avoid allocation: Emacs
1159 processes input from X in a signal handler; processing X input may
1160 call malloc; if input arrives while a matching routine is calling
1161 malloc, then we're scrod. But Emacs can't just block input while
1162 calling matching routines; then we don't notice interrupts when
1163 they come in. So, Emacs blocks input around all regexp calls
1164 except the matching calls, which it leaves unprotected, in the
1165 faith that they will not malloc. */
1167 /* Normally, this is fine. */
1168 #define MATCH_MAY_ALLOCATE
1170 /* When using GNU C, we are not REALLY using the C alloca, no matter
1171 what config.h may say. So don't take precautions for it. */
1176 /* The match routines may not allocate if (1) they would do it with malloc
1177 and (2) it's not safe for them to use malloc.
1178 Note that if REL_ALLOC is defined, matching would not use malloc for the
1179 failure stack, but we would still use it for the register vectors;
1180 so REL_ALLOC should not affect this. */
1181 #if (defined (C_ALLOCA) || defined (REGEX_MALLOC)) && defined (emacs)
1182 #undef MATCH_MAY_ALLOCATE
1185 /* Failure stack declarations and macros; both re_compile_fastmap and
1186 re_match_2 use a failure stack. These have to be macros because of
1187 REGEX_ALLOCATE_STACK. */
1189 /* Number of failure points for which to initially allocate space
1190 when matching. If this number is exceeded, we allocate more
1191 space, so it is not a hard limit. */
1192 #ifndef INIT_FAILURE_ALLOC
1193 #define INIT_FAILURE_ALLOC 20
1196 /* Roughly the maximum number of failure points on the stack. Would be
1197 exactly that if always used MAX_FAILURE_SPACE each time we failed.
1198 This is a variable only so users of regex can assign to it; we never
1199 change it ourselves. */
1200 #if defined (MATCH_MAY_ALLOCATE) || defined (REGEX_MALLOC)
1201 /* 4400 was enough to cause a crash on Alpha OSF/1,
1202 whose default stack limit is 2mb. */
1203 int re_max_failures = 50000;
1205 int re_max_failures = 2000;
1208 union fail_stack_elt {
1209 const unsigned char *pointer;
1213 typedef union fail_stack_elt fail_stack_elt_t;
1216 fail_stack_elt_t *stack;
1218 Element_count avail; /* Offset of next open position. */
1221 #define FAIL_STACK_EMPTY() (fail_stack.avail == 0)
1222 #define FAIL_STACK_PTR_EMPTY() (fail_stack_ptr->avail == 0)
1223 #define FAIL_STACK_FULL() (fail_stack.avail == fail_stack.size)
1225 /* Define macros to initialize and free the failure stack.
1226 Do `return -2' if the alloc fails. */
1228 #ifdef MATCH_MAY_ALLOCATE
1229 #define INIT_FAIL_STACK() \
1231 fail_stack.stack = (fail_stack_elt_t *) \
1232 REGEX_ALLOCATE_STACK (INIT_FAILURE_ALLOC * \
1233 sizeof (fail_stack_elt_t)); \
1235 if (fail_stack.stack == NULL) \
1238 fail_stack.size = INIT_FAILURE_ALLOC; \
1239 fail_stack.avail = 0; \
1242 #define RESET_FAIL_STACK() REGEX_FREE_STACK (fail_stack.stack)
1244 #define INIT_FAIL_STACK() \
1246 fail_stack.avail = 0; \
1249 #define RESET_FAIL_STACK()
1252 /* Double the size of FAIL_STACK, up to approximately `re_max_failures' items.
1254 Return 1 if succeeds, and 0 if either ran out of memory
1255 allocating space for it or it was already too large.
1257 REGEX_REALLOCATE_STACK requires `destination' be declared. */
1259 #define DOUBLE_FAIL_STACK(fail_stack) \
1260 ((int) (fail_stack).size > re_max_failures * MAX_FAILURE_ITEMS \
1262 : ((fail_stack).stack = (fail_stack_elt_t *) \
1263 REGEX_REALLOCATE_STACK ((fail_stack).stack, \
1264 (fail_stack).size * sizeof (fail_stack_elt_t), \
1265 ((fail_stack).size << 1) * sizeof (fail_stack_elt_t)), \
1267 (fail_stack).stack == NULL \
1269 : ((fail_stack).size <<= 1, \
1272 /* Push pointer POINTER on FAIL_STACK.
1273 Return 1 if was able to do so and 0 if ran out of memory allocating
1275 #define PUSH_PATTERN_OP(POINTER, FAIL_STACK) \
1276 ((FAIL_STACK_FULL () \
1277 && !DOUBLE_FAIL_STACK (FAIL_STACK)) \
1279 : ((FAIL_STACK).stack[(FAIL_STACK).avail++].pointer = POINTER, \
1282 /* Push a pointer value onto the failure stack.
1283 Assumes the variable `fail_stack'. Probably should only
1284 be called from within `PUSH_FAILURE_POINT'. */
1285 #define PUSH_FAILURE_POINTER(item) \
1286 fail_stack.stack[fail_stack.avail++].pointer = \
1287 (const unsigned char*)(item)
1289 /* This pushes an integer-valued item onto the failure stack.
1290 Assumes the variable `fail_stack'. Probably should only
1291 be called from within `PUSH_FAILURE_POINT'. */
1292 #define PUSH_FAILURE_INT(item) \
1293 fail_stack.stack[fail_stack.avail++].integer = (item)
1295 /* Push a fail_stack_elt_t value onto the failure stack.
1296 Assumes the variable `fail_stack'. Probably should only
1297 be called from within `PUSH_FAILURE_POINT'. */
1298 #define PUSH_FAILURE_ELT(item) \
1299 fail_stack.stack[fail_stack.avail++] = (item)
1301 /* These three POP... operations complement the three PUSH... operations.
1302 All assume that `fail_stack' is nonempty. */
1303 #define POP_FAILURE_POINTER() fail_stack.stack[--fail_stack.avail].pointer
1304 #define POP_FAILURE_INT() fail_stack.stack[--fail_stack.avail].integer
1305 #define POP_FAILURE_ELT() fail_stack.stack[--fail_stack.avail]
1307 /* Used to omit pushing failure point id's when we're not debugging. */
1308 #ifdef REGEX_DEBUG_FLAG
1309 #define DEBUG_PUSH PUSH_FAILURE_INT
1310 #define DEBUG_POP(item_addr) *(item_addr) = POP_FAILURE_INT ()
1312 #define DEBUG_PUSH(item)
1313 #define DEBUG_POP(item_addr)
1316 /* Push the information about the state we will need
1317 if we ever fail back to it.
1319 Requires variables fail_stack, regstart, regend, reg_info, and
1320 num_regs be declared. DOUBLE_FAIL_STACK requires `destination' be
1323 Does `return FAILURE_CODE' if runs out of memory. */
1325 #if !defined (REGEX_MALLOC) && !defined (REL_ALLOC)
1326 #define DECLARE_DESTINATION char *destination
1328 #define DECLARE_DESTINATION DECLARE_NOTHING
1331 #define PUSH_FAILURE_POINT(pattern_place, string_place, failure_code) \
1333 DECLARE_DESTINATION; \
1334 /* Must be int, so when we don't save any registers, the\
1335 arithmetic \ of 0 + -1 isn't done as unsigned. */ \
1338 DEBUG_STATEMENT (failure_id++); \
1339 DEBUG_STATEMENT (nfailure_points_pushed++); \
1340 DEBUG_PRINT2 ("\nPUSH_FAILURE_POINT #%u:\n", failure_id); \
1341 DEBUG_PRINT2 (" Before push, next avail: %lu\n", \
1342 (long unsigned int)(fail_stack).avail); \
1343 DEBUG_PRINT2 (" size: %lu\n", \
1344 (long unsigned int)(fail_stack).size); \
1346 DEBUG_PRINT2 (" slots needed: %d\n", NUM_FAILURE_ITEMS); \
1347 DEBUG_PRINT2 (" available: %ld\n", \
1348 (long) REMAINING_AVAIL_SLOTS); \
1350 /* Ensure we have enough space allocated for what \
1351 * we will push. */ \
1352 while (REMAINING_AVAIL_SLOTS < NUM_FAILURE_ITEMS) { \
1353 if (!DOUBLE_FAIL_STACK (fail_stack)) \
1354 return failure_code; \
1356 DEBUG_PRINT2 ("\n Doubled stack; size now: %lu\n", \
1357 (unsigned long) (fail_stack).size); \
1358 DEBUG_PRINT2 (" slots available: %ld\n", \
1359 (long) REMAINING_AVAIL_SLOTS); \
1362 /* Push the info, starting with the registers. */ \
1363 DEBUG_PRINT1 ("\n"); \
1365 for (this_reg = lowest_active_reg; \
1366 this_reg <= highest_active_reg; \
1368 DEBUG_PRINT2 (" Pushing reg: %d\n", this_reg); \
1369 DEBUG_STATEMENT (num_regs_pushed++); \
1371 DEBUG_PRINT2(" start: %p\n", \
1372 regstart[this_reg]); \
1373 PUSH_FAILURE_POINTER(regstart[this_reg]); \
1375 DEBUG_PRINT2 (" end: %p\n", \
1376 regend[this_reg]); \
1377 PUSH_FAILURE_POINTER(regend[this_reg]); \
1379 DEBUG_PRINT2 (" info: 0x%p\n ", \
1380 ®_info[this_reg]); \
1381 DEBUG_PRINT2 (" match_null=%d", \
1382 REG_MATCH_NULL_STRING_P \
1383 (reg_info[this_reg])); \
1384 DEBUG_PRINT2 (" active=%d", \
1385 IS_ACTIVE (reg_info[this_reg])); \
1386 DEBUG_PRINT2 (" matched_something=%d", \
1388 (reg_info[this_reg])); \
1389 DEBUG_PRINT2 (" ever_matched_something=%d", \
1390 EVER_MATCHED_SOMETHING \
1391 (reg_info[this_reg])); \
1392 DEBUG_PRINT1 ("\n"); \
1393 PUSH_FAILURE_ELT(reg_info[this_reg].word); \
1396 DEBUG_PRINT2 (" Pushing low active reg: %d\n", \
1397 lowest_active_reg); \
1398 PUSH_FAILURE_INT (lowest_active_reg); \
1400 DEBUG_PRINT2 (" Pushing high active reg: %d\n", \
1401 highest_active_reg); \
1402 PUSH_FAILURE_INT (highest_active_reg); \
1404 DEBUG_PRINT2 (" Pushing pattern 0x%lx: \n", \
1405 (long) pattern_place); \
1406 DEBUG_PRINT_COMPILED_PATTERN(bufp, pattern_place, pend); \
1407 PUSH_FAILURE_POINTER (pattern_place); \
1409 DEBUG_PRINT2(" Pushing string %p: `", \
1411 DEBUG_PRINT_DOUBLE_STRING(string_place, \
1414 DEBUG_PRINT1("'\n"); \
1415 PUSH_FAILURE_POINTER (string_place); \
1417 DEBUG_PRINT2(" Pushing failure id: %u\n", failure_id); \
1418 DEBUG_PUSH(failure_id); \
1421 /* This is the number of items that are pushed and popped on the stack
1422 for each register. */
1423 #define NUM_REG_ITEMS 3
1425 /* Individual items aside from the registers. */
1427 #define NUM_NONREG_ITEMS 5 /* Includes failure point id. */
1429 #define NUM_NONREG_ITEMS 4
1432 /* We push at most this many items on the stack. */
1433 /* We used to use (num_regs - 1), which is the number of registers
1434 this regexp will save; but that was changed to 5
1435 to avoid stack overflow for a regexp with lots of parens. */
1436 #define MAX_FAILURE_ITEMS (5 * (NUM_REG_ITEMS + NUM_NONREG_ITEMS))
1438 /* We actually push this many items. */
1439 #define NUM_FAILURE_ITEMS \
1440 ((highest_active_reg - lowest_active_reg + 1) * NUM_REG_ITEMS \
1443 /* How many items can still be added to the stack without overflowing it. */
1444 #define REMAINING_AVAIL_SLOTS ((int) ((fail_stack).size - (fail_stack).avail))
1446 /* Pops what PUSH_FAIL_STACK pushes.
1448 We restore into the parameters, all of which should be lvalues:
1449 STR -- the saved data position.
1450 PAT -- the saved pattern position.
1451 LOW_REG, HIGH_REG -- the highest and lowest active registers.
1452 REGSTART, REGEND -- arrays of string positions.
1453 REG_INFO -- array of information about each subexpression.
1455 Also assumes the variables `fail_stack' and (if debugging), `bufp',
1456 `pend', `string1', `size1', `string2', and `size2'. */
1458 #define POP_FAILURE_POINT(str, pat, low_reg, high_reg, \
1459 regstart, regend, reg_info) \
1462 const unsigned char *string_temp; \
1463 DEBUG_STATEMENT (fail_stack_elt_t ffailure_id); \
1465 assert (!FAIL_STACK_EMPTY ()); \
1467 /* Remove failure points and point to \
1468 * how many regs pushed. */ \
1469 DEBUG_PRINT1 ("POP_FAILURE_POINT:\n"); \
1470 DEBUG_PRINT2 (" Before pop, next avail: %lu\n", \
1471 (unsigned long) fail_stack.avail); \
1472 DEBUG_PRINT2 (" size: %lu\n", \
1473 (unsigned long) fail_stack.size); \
1475 assert (fail_stack.avail >= NUM_NONREG_ITEMS); \
1477 DEBUG_POP (&ffailure_id.integer); \
1478 DEBUG_PRINT2 (" Popping failure id: %u\n", \
1479 * (unsigned int *) &ffailure_id); \
1481 /* If the saved string location is NULL, it \
1482 came from an on_failure_keep_string_jump opcode, \
1483 and we want to throw away the saved NULL, thus \
1484 retaining our current position in the string. */ \
1485 string_temp = (const unsigned char *) \
1486 POP_FAILURE_POINTER (); \
1487 if (string_temp != NULL) \
1488 str = string_temp; \
1490 DEBUG_PRINT2(" Popping string %p: `", str); \
1491 DEBUG_PRINT_DOUBLE_STRING(str, \
1494 DEBUG_PRINT1("'\n"); \
1496 pat = (unsigned char *) POP_FAILURE_POINTER(); \
1497 DEBUG_PRINT2 (" Popping pattern %p: ", pat); \
1498 DEBUG_PRINT_COMPILED_PATTERN(bufp, pat, pend); \
1500 /* Restore register info. */ \
1501 high_reg = (unsigned) POP_FAILURE_INT (); \
1502 DEBUG_PRINT2 (" Popping high active reg: %d\n", high_reg); \
1504 low_reg = (unsigned) POP_FAILURE_INT (); \
1505 DEBUG_PRINT2 (" Popping low active reg: %d\n", low_reg); \
1507 for (this_reg = high_reg; this_reg >= low_reg; this_reg--) { \
1508 DEBUG_PRINT2 (" Popping reg: %d\n", this_reg); \
1510 reg_info[this_reg].word = POP_FAILURE_ELT (); \
1511 DEBUG_PRINT2 (" info: %p\n", \
1512 ®_info[this_reg]); \
1514 regend[this_reg] = POP_FAILURE_POINTER (); \
1515 DEBUG_PRINT2 (" end: %p\n", \
1516 regend[this_reg]); \
1518 regstart[this_reg] = POP_FAILURE_POINTER (); \
1519 DEBUG_PRINT2 (" start: %p\n", \
1520 regstart[this_reg]); \
1523 set_regs_matched_done = 0; \
1524 DEBUG_STATEMENT (nfailure_points_popped++); \
1525 } while (0) /* POP_FAILURE_POINT */
1527 /* Structure for per-register (a.k.a. per-group) information.
1528 Other register information, such as the
1529 starting and ending positions (which are addresses), and the list of
1530 inner groups (which is a bits list) are maintained in separate
1533 We are making a (strictly speaking) nonportable assumption here: that
1534 the compiler will pack our bit fields into something that fits into
1535 the type of `word', i.e., is something that fits into one item on the
1539 fail_stack_elt_t word;
1541 /* This field is one if this group can match the empty string,
1542 zero if not. If not yet determined, `MATCH_NULL_UNSET_VALUE'. */
1543 #define MATCH_NULL_UNSET_VALUE 3
1544 unsigned match_null_string_p:2;
1545 unsigned is_active:1;
1546 unsigned matched_something:1;
1547 unsigned ever_matched_something:1;
1549 } register_info_type;
1551 #define REG_MATCH_NULL_STRING_P(R) ((R).bits.match_null_string_p)
1552 #define IS_ACTIVE(R) ((R).bits.is_active)
1553 #define MATCHED_SOMETHING(R) ((R).bits.matched_something)
1554 #define EVER_MATCHED_SOMETHING(R) ((R).bits.ever_matched_something)
1556 /* Call this when have matched a real character; it sets `matched' flags
1557 for the subexpressions which we are currently inside. Also records
1558 that those subexprs have matched. */
1559 #define SET_REGS_MATCHED() \
1562 if (!set_regs_matched_done) \
1565 set_regs_matched_done = 1; \
1566 for (r = lowest_active_reg; r <= highest_active_reg; r++) \
1568 MATCHED_SOMETHING (reg_info[r]) \
1569 = EVER_MATCHED_SOMETHING (reg_info[r]) \
1576 /* Registers are set to a sentinel when they haven't yet matched. */
1577 static unsigned char reg_unset_dummy;
1578 #define REG_UNSET_VALUE (®_unset_dummy)
1579 #define REG_UNSET(e) ((e) == REG_UNSET_VALUE)
1581 /* Subroutine declarations and macros for regex_compile. */
1583 /* Fetch the next character in the uncompiled pattern---translating it
1584 if necessary. Also cast from a signed character in the constant
1585 string passed to us by the user to an unsigned char that we can use
1586 as an array index (in, e.g., `translate'). */
1587 #define PATFETCH(c) \
1590 c = TRANSLATE (c); \
1593 /* Fetch the next character in the uncompiled pattern, with no
1595 #define PATFETCH_RAW(c) \
1596 do {if (p == pend) return REG_EEND; \
1597 assert (p < pend); \
1598 c = charptr_emchar (p); \
1602 /* Go backwards one character in the pattern. */
1603 #define PATUNFETCH DEC_CHARPTR (p)
1607 #define PATFETCH_EXTENDED(emch) \
1608 do {if (p == pend) return REG_EEND; \
1609 assert (p < pend); \
1610 emch = charptr_emchar ((const Bufbyte *) p); \
1612 if (TRANSLATE_P (translate) && emch < 0x80) \
1613 emch = (Emchar) (unsigned char) RE_TRANSLATE (emch); \
1616 #define PATFETCH_RAW_EXTENDED(emch) \
1617 do {if (p == pend) return REG_EEND; \
1618 assert (p < pend); \
1619 emch = charptr_emchar ((const Bufbyte *) p); \
1623 #define PATUNFETCH_EXTENDED DEC_CHARPTR (p)
1625 #define PATFETCH_EITHER(emch) \
1627 if (has_extended_chars) \
1628 PATFETCH_EXTENDED (emch); \
1633 #define PATFETCH_RAW_EITHER(emch) \
1635 if (has_extended_chars) \
1636 PATFETCH_RAW_EXTENDED (emch); \
1638 PATFETCH_RAW (emch); \
1641 #define PATUNFETCH_EITHER \
1643 if (has_extended_chars) \
1644 PATUNFETCH_EXTENDED (emch); \
1646 PATUNFETCH (emch); \
1649 #else /* not MULE */
1651 #define PATFETCH_EITHER(emch) PATFETCH (emch)
1652 #define PATFETCH_RAW_EITHER(emch) PATFETCH_RAW (emch)
1653 #define PATUNFETCH_EITHER PATUNFETCH
1657 /* If `translate' is non-null, return translate[D], else just D. We
1658 cast the subscript to translate because some data is declared as
1659 `char *', to avoid warnings when a string constant is passed. But
1660 when we use a character as a subscript we must make it unsigned. */
1661 #define TRANSLATE(d) (TRANSLATE_P (translate) ? RE_TRANSLATE (d) : (d))
1663 /* Macros for outputting the compiled pattern into `buffer'. */
1665 /* If the buffer isn't allocated when it comes in, use this. */
1666 #define INIT_BUF_SIZE 32
1668 /* Make sure we have at least N more bytes of space in buffer. */
1669 #define GET_BUFFER_SPACE(n) \
1670 while (buf_end - bufp->buffer + (n) > (ptrdiff_t) bufp->allocated) \
1673 /* Make sure we have one more byte of buffer space and then add C to it. */
1674 #define BUF_PUSH(c) \
1676 GET_BUFFER_SPACE (1); \
1677 *buf_end++ = (unsigned char) (c); \
1680 /* Ensure we have two more bytes of buffer space and then append C1 and C2. */
1681 #define BUF_PUSH_2(c1, c2) \
1683 GET_BUFFER_SPACE (2); \
1684 *buf_end++ = (unsigned char) (c1); \
1685 *buf_end++ = (unsigned char) (c2); \
1688 /* As with BUF_PUSH_2, except for three bytes. */
1689 #define BUF_PUSH_3(c1, c2, c3) \
1691 GET_BUFFER_SPACE (3); \
1692 *buf_end++ = (unsigned char) (c1); \
1693 *buf_end++ = (unsigned char) (c2); \
1694 *buf_end++ = (unsigned char) (c3); \
1697 /* Store a jump with opcode OP at LOC to location TO. We store a
1698 relative address offset by the three bytes the jump itself occupies. */
1699 #define STORE_JUMP(op, loc, to) \
1700 store_op1 (op, loc, (to) - (loc) - 3)
1702 /* Likewise, for a two-argument jump. */
1703 #define STORE_JUMP2(op, loc, to, arg) \
1704 store_op2 (op, loc, (to) - (loc) - 3, arg)
1706 /* Like `STORE_JUMP', but for inserting. Assume `buf_end' is the
1708 #define INSERT_JUMP(op, loc, to) \
1709 insert_op1 (op, loc, (to) - (loc) - 3, buf_end)
1711 /* Like `STORE_JUMP2', but for inserting. Assume `buf_end' is the
1713 #define INSERT_JUMP2(op, loc, to, arg) \
1714 insert_op2 (op, loc, (to) - (loc) - 3, arg, buf_end)
1716 /* This is not an arbitrary limit: the arguments which represent offsets
1717 into the pattern are two bytes long. So if 2^16 bytes turns out to
1718 be too small, many things would have to change. */
1719 #define MAX_BUF_SIZE (1L << 16)
1721 /* Extend the buffer by twice its current size via realloc and
1722 reset the pointers that pointed into the old block to point to the
1723 correct places in the new one. If extending the buffer results in it
1724 being larger than MAX_BUF_SIZE, then flag memory exhausted. */
1725 #define EXTEND_BUFFER() \
1727 re_char *old_buffer = bufp->buffer; \
1728 if (bufp->allocated == MAX_BUF_SIZE) \
1730 bufp->allocated <<= 1; \
1731 if (bufp->allocated > MAX_BUF_SIZE) \
1732 bufp->allocated = MAX_BUF_SIZE; \
1733 bufp->buffer = (unsigned char *)xrealloc( \
1734 bufp->buffer, bufp->allocated); \
1735 if (bufp->buffer == NULL) { \
1736 return REG_ESPACE; \
1738 /* If the buffer moved, move all the pointers into it. */ \
1739 if (old_buffer != bufp->buffer) { \
1740 buf_end = (buf_end - old_buffer) + bufp->buffer; \
1741 begalt = (begalt - old_buffer) + bufp->buffer; \
1742 if (fixup_alt_jump) { \
1744 (fixup_alt_jump - old_buffer) + \
1748 laststart = (laststart - old_buffer) + \
1751 if (pending_exact) { \
1753 (pending_exact - old_buffer) + \
1759 /* Since we have one byte reserved for the register number argument to
1760 {start,stop}_memory, the maximum number of groups we can report
1761 things about is what fits in that byte. */
1762 #define MAX_REGNUM 255
1764 /* But patterns can have more than `MAX_REGNUM' registers. We just
1765 ignore the excess. */
1766 typedef unsigned regnum_t;
1768 #define INIT_REG_TRANSLATE_SIZE 5
1770 /* Macros for the compile stack. */
1772 /* Since offsets can go either forwards or backwards, this type needs to
1773 be able to hold values from -(MAX_BUF_SIZE - 1) to MAX_BUF_SIZE - 1. */
1774 typedef int pattern_offset_t;
1777 pattern_offset_t begalt_offset;
1778 pattern_offset_t fixup_alt_jump;
1779 pattern_offset_t inner_group_offset;
1780 pattern_offset_t laststart_offset;
1782 } compile_stack_elt_t;
1785 compile_stack_elt_t *stack;
1787 unsigned avail; /* Offset of next open position. */
1788 } compile_stack_type;
1790 #define INIT_COMPILE_STACK_SIZE 32
1792 #define COMPILE_STACK_EMPTY (compile_stack.avail == 0)
1793 #define COMPILE_STACK_FULL (compile_stack.avail == compile_stack.size)
1795 /* The next available element. */
1796 #define COMPILE_STACK_TOP (compile_stack.stack[compile_stack.avail])
1798 /* Set the bit for character C in a bit vector. */
1799 #define SET_LIST_BIT(c) \
1800 (buf_end[((unsigned char) (c)) / BYTEWIDTH] \
1801 |= 1 << (((unsigned char) c) % BYTEWIDTH))
1805 /* Set the "bit" for character C in a range table. */
1806 #define SET_RANGETAB_BIT(c) put_range_table (rtab, c, c, Qt)
1808 /* Set the "bit" for character c in the appropriate table. */
1809 #define SET_EITHER_BIT(c) \
1811 if (has_extended_chars) \
1812 SET_RANGETAB_BIT (c); \
1817 #else /* not MULE */
1819 #define SET_EITHER_BIT(c) SET_LIST_BIT (c)
1823 /* Get the next unsigned number in the uncompiled pattern. */
1824 #define GET_UNSIGNED_NUMBER(num) \
1828 while (ISDIGIT (c)) \
1832 num = num * 10 + c - '0'; \
1840 #define CHAR_CLASS_MAX_LENGTH 9 /* Namely, `multibyte'. */
1843 static void store_op1(re_opcode_t op, unsigned char *loc, int arg);
1844 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2);
1845 static void insert_op1(re_opcode_t op, unsigned char *loc, int arg,
1846 unsigned char *end);
1847 static void insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
1848 unsigned char *end);
1849 static re_bool at_begline_loc_p(re_char*, re_char*, reg_syntax_t);
1850 static re_bool at_endline_loc_p(re_char*, re_char*, int syntax);
1851 static re_bool group_in_compile_stack(compile_stack_type compile_stack,
1853 static reg_errcode_t compile_range(re_char ** p_ptr, re_char * pend,
1854 RE_TRANSLATE_TYPE translate,
1855 reg_syntax_t syntax, unsigned char *b);
1857 static reg_errcode_t compile_extended_range(re_char ** p_ptr,
1859 RE_TRANSLATE_TYPE translate,
1860 reg_syntax_t syntax,
1863 static re_bool group_match_null_string_p(unsigned char **p,
1865 register_info_type * reg_info);
1866 static re_bool alt_match_null_string_p(unsigned char *p, unsigned char *end,
1867 register_info_type * reg_info);
1868 static re_bool common_op_match_null_string_p(unsigned char **p,
1870 register_info_type * reg_info);
1871 static int bcmp_translate(const unsigned char *s1, const unsigned char *s2,
1872 REGISTER int len, RE_TRANSLATE_TYPE translate);
1873 static int re_match_2_internal(struct re_pattern_buffer *bufp,
1874 re_char * string1, int size1,
1875 re_char * string2, int size2, int pos,
1876 struct re_registers *regs, int stop);
1878 #ifndef MATCH_MAY_ALLOCATE
1880 /* If we cannot allocate large objects within re_match_2_internal,
1881 we make the fail stack and register vectors global.
1882 The fail stack, we grow to the maximum size when a regexp
1884 The register vectors, we adjust in size each time we
1885 compile a regexp, according to the number of registers it needs. */
1887 static fail_stack_type fail_stack;
1889 /* Size with which the following vectors are currently allocated.
1890 That is so we can make them bigger as needed,
1891 but never make them smaller. */
1892 static int regs_allocated_size;
1894 static re_char **regstart, **regend;
1895 static re_char **old_regstart, **old_regend;
1896 static re_char **best_regstart, **best_regend;
1897 static register_info_type *reg_info;
1898 static re_char **reg_dummy;
1899 static register_info_type *reg_info_dummy;
1901 /* Make the register vectors big enough for NUM_REGS registers,
1902 but don't make them smaller. */
1904 static void regex_grow_registers(int num_regs)
1906 if (num_regs > regs_allocated_size) {
1907 RETALLOC_IF(regstart, num_regs, re_char *);
1908 RETALLOC_IF(regend, num_regs, re_char *);
1909 RETALLOC_IF(old_regstart, num_regs, re_char *);
1910 RETALLOC_IF(old_regend, num_regs, re_char *);
1911 RETALLOC_IF(best_regstart, num_regs, re_char *);
1912 RETALLOC_IF(best_regend, num_regs, re_char *);
1913 RETALLOC_IF(reg_info, num_regs, register_info_type);
1914 RETALLOC_IF(reg_dummy, num_regs, re_char *);
1915 RETALLOC_IF(reg_info_dummy, num_regs, register_info_type);
1917 regs_allocated_size = num_regs;
1921 #endif /* not MATCH_MAY_ALLOCATE */
1923 /* auxiliary stuff, FSF theft */
1924 /* Character classes. */
1927 RECC_ALNUM, RECC_ALPHA, RECC_WORD,
1928 RECC_GRAPH, RECC_PRINT,
1929 RECC_LOWER, RECC_UPPER,
1930 RECC_PUNCT, RECC_CNTRL,
1931 RECC_DIGIT, RECC_XDIGIT,
1932 RECC_BLANK, RECC_SPACE,
1933 RECC_MULTIBYTE, RECC_NONASCII,
1934 RECC_ASCII, RECC_UNIBYTE
1937 /* Map a string to the char class it names (if any). */
1938 static inline re_wctype_t
1939 re_wctype(re_char *str)
1941 const char *string = (const char*)str;
1943 if (STREQ (string, "alnum")) {
1945 } else if (STREQ (string, "alpha")) {
1947 } else if (STREQ (string, "digit")) {
1949 } else if (STREQ (string, "graph")) {
1951 } else if (STREQ (string, "space")) {
1953 } else if (STREQ (string, "upper")) {
1955 } else if (STREQ (string, "lower")) {
1957 } else if (STREQ (string, "word")) {
1959 } else if (STREQ (string, "print")) {
1961 } else if (STREQ (string, "punct")) {
1963 } else if (STREQ (string, "xdigit")) {
1965 } else if (STREQ (string, "cntrl")) {
1967 } else if (STREQ (string, "blank")) {
1969 } else if (STREQ (string, "ascii")) {
1971 } else if (STREQ (string, "nonascii")) {
1972 return RECC_NONASCII;
1973 } else if (STREQ (string, "unibyte")) {
1974 return RECC_UNIBYTE;
1975 } else if (STREQ (string, "multibyte")) {
1976 return RECC_MULTIBYTE;
1982 /* True if CH is in the char class CC. */
1984 re_iswctype(int ch, re_wctype_t cc)
1988 return ISALNUM (ch);
1990 return ISALPHA (ch);
1992 return ISBLANK (ch);
1994 return ISCNTRL (ch);
1996 return ISDIGIT (ch);
1998 return ISGRAPH (ch);
2000 return ISLOWER (ch);
2002 return ISPRINT (ch);
2004 return ISPUNCT (ch);
2006 return ISSPACE (ch);
2008 return ISUPPER (ch);
2010 return ISXDIGIT (ch);
2012 return IS_REAL_ASCII (ch);
2014 return !IS_REAL_ASCII (ch);
2018 return ISUNIBYTE((unsigned int)ch);
2019 case RECC_MULTIBYTE:
2020 return !ISUNIBYTE((unsigned int)ch);
2030 /* `regex_compile' compiles PATTERN (of length SIZE) according to SYNTAX.
2031 Returns one of error codes defined in `regex.h', or zero for success.
2033 Assumes the `allocated' (and perhaps `buffer') and `translate'
2034 fields are set in BUFP on entry.
2036 If it succeeds, results are put in BUFP (if it returns an error, the
2037 contents of BUFP are undefined):
2038 `buffer' is the compiled pattern;
2039 `syntax' is set to SYNTAX;
2040 `used' is set to the length of the compiled pattern;
2041 `fastmap_accurate' is zero;
2042 `re_ngroups' is the number of groups/subexpressions (including shy
2044 `re_nsub' is the number of non-shy groups in PATTERN;
2045 `not_bol' and `not_eol' are zero;
2047 The `fastmap' and `newline_anchor' fields are neither
2048 examined nor set. */
2050 static reg_errcode_t
2051 regex_compile(re_char * pattern, int size, reg_syntax_t syntax,
2052 struct re_pattern_buffer *bufp)
2054 /* We fetch characters from PATTERN here. We declare these as int
2055 (or possibly long) so that chars above 127 can be used as
2056 array indices. The macros that fetch a character from the pattern
2057 make sure to coerce to unsigned char before assigning, so we won't
2058 get bitten by negative numbers here. */
2059 /* XEmacs change: used to be unsigned char. */
2060 REGISTER EMACS_INT c, c1;
2062 /* A random temporary spot in PATTERN. */
2065 /* Points to the end of the buffer, where we should append. */
2066 REGISTER unsigned char *buf_end;
2068 /* Keeps track of unclosed groups. */
2069 compile_stack_type compile_stack;
2071 /* Points to the current (ending) position in the pattern. */
2072 re_char *p = pattern;
2073 re_char *pend = pattern + size;
2075 /* How to translate the characters in the pattern. */
2076 RE_TRANSLATE_TYPE translate = bufp->translate;
2078 /* Address of the count-byte of the most recently inserted `exactn'
2079 command. This makes it possible to tell if a new exact-match
2080 character can be added to that command or if the character requires
2081 a new `exactn' command. */
2082 unsigned char *pending_exact = 0;
2084 /* Address of start of the most recently finished expression.
2085 This tells, e.g., postfix * where to find the start of its
2086 operand. Reset at the beginning of groups and alternatives. */
2087 unsigned char *laststart = 0;
2089 /* Address of beginning of regexp, or inside of last group. */
2090 unsigned char *begalt;
2092 /* Place in the uncompiled pattern (i.e., the {) to
2093 which to go back if the interval is invalid. */
2094 re_char *beg_interval;
2096 /* Address of the place where a forward jump should go to the end of
2097 the containing expression. Each alternative of an `or' -- except the
2098 last -- ends with a forward jump of this sort. */
2099 unsigned char *fixup_alt_jump = 0;
2101 /* Counts open-groups as they are encountered. Remembered for the
2102 matching close-group on the compile stack, so the same register
2103 number is put in the stop_memory as the start_memory. */
2104 regnum_t regnum = 0;
2106 /* we need to close over compile_stack */
2107 # define free_stack(args...) xfree(compile_stack.stack)
2109 #ifdef REGEX_DEBUG_FLAG
2110 DEBUG_PRINT1("\nCompiling pattern: ");
2114 for (debug_count = 0; debug_count < size; debug_count++)
2115 putchar(pattern[debug_count]);
2120 /* Initialize the compile stack. */
2121 compile_stack.stack =
2122 TALLOC(INIT_COMPILE_STACK_SIZE, compile_stack_elt_t);
2123 if (compile_stack.stack == NULL) {
2126 compile_stack.size = INIT_COMPILE_STACK_SIZE;
2127 compile_stack.avail = 0;
2129 /* Initialize the pattern buffer. */
2130 bufp->syntax = syntax;
2131 bufp->fastmap_accurate = 0;
2132 bufp->not_bol = bufp->not_eol = 0;
2134 /* Set `used' to zero, so that if we return an error, the pattern
2135 printer (for debugging) will think there's no pattern. We reset it
2139 /* Always count groups, whether or not bufp->no_sub is set. */
2141 bufp->re_ngroups = 0;
2143 /* Allocate index translation array if needed. */
2144 if (bufp->external_to_internal_register == 0) {
2145 bufp->external_to_internal_register_size =
2146 INIT_REG_TRANSLATE_SIZE;
2147 RETALLOC(bufp->external_to_internal_register,
2148 bufp->external_to_internal_register_size, int);
2150 /* Initialize translations to impossible value to aid debugging. */
2154 bufp->external_to_internal_register[0] = 0;
2155 for (i = 1; i < bufp->external_to_internal_register_size; i++)
2156 bufp->external_to_internal_register[i] =
2160 #if !defined (emacs) && !defined (SYNTAX_TABLE)
2161 /* Initialize the syntax table. */
2165 if (bufp->allocated == 0) {
2167 /* If zero allocated, but buffer is non-null, try to
2168 realloc enough space. This loses if buffer's address
2169 is bogus, but that is the user's responsibility. */
2170 RETALLOC(bufp->buffer, INIT_BUF_SIZE, unsigned char);
2172 /* Caller did not allocate a buffer. Do it for them. */
2173 bufp->buffer = TALLOC(INIT_BUF_SIZE, unsigned char);
2175 if (!bufp->buffer) {
2180 bufp->allocated = INIT_BUF_SIZE;
2183 begalt = buf_end = bufp->buffer;
2185 /* Loop through the uncompiled pattern until we're at the end. */
2191 if ( /* If at start of pattern, it's an operator. */
2193 /* If context independent, it's an operator. */
2194 || syntax & RE_CONTEXT_INDEP_ANCHORS
2195 /* Otherwise, depends on what's come before. */
2196 || at_begline_loc_p(pattern, p, syntax))
2204 if ( /* If at end of pattern, it's an
2207 /* If context independent, it's an
2209 || syntax & RE_CONTEXT_INDEP_ANCHORS
2210 /* Otherwise, depends on what's
2212 || at_endline_loc_p(p, pend, syntax))
2221 if ((syntax & RE_BK_PLUS_QM) ||
2222 (syntax & RE_LIMITED_OPS)) {
2228 re_bool zero_times_ok;
2229 re_bool many_times_ok;
2232 /* If there is no previous pattern... */
2234 if (syntax & RE_CONTEXT_INVALID_OPS) {
2237 } else if (!(syntax & RE_CONTEXT_INDEP_OPS)) {
2242 /* true means zero/many matches are allowed. */
2243 zero_times_ok = c != '+';
2244 many_times_ok = c != '?';
2246 /* true means match shortest string possible. */
2249 /* If there is a sequence of repetition chars, collapse it
2250 down to just one (the right one). We can't combine
2251 interval operators with these because of, e.g., `a{2}*',
2252 which should only match an even number of `a's. */
2257 || (!(syntax & RE_BK_PLUS_QM)
2258 && (c == '+' || c == '?'))) ;
2260 else if (syntax & RE_BK_PLUS_QM
2268 if (!(c1 == '+' || c1 == '?')) {
2280 /* If we get here, we found another repeat
2282 if (!(syntax & RE_NO_MINIMAL_MATCHING)) {
2283 /* "*?" and "+?" and "??" are okay (and
2284 mean match minimally), but other
2285 sequences (such as "*??" and "+++")
2286 are rejected (reserved for future
2288 if (minimal || c != '?') {
2294 zero_times_ok |= c != '+';
2295 many_times_ok |= c != '?';
2299 /* Star, etc. applied to an empty pattern is equivalent
2300 to an empty pattern. */
2305 /* Now we know whether zero matches is allowed
2306 and whether two or more matches is allowed
2307 and whether we want minimal or maximal matching. */
2309 if (!many_times_ok) {
2311 0: /on_failure_jump to 6
2316 GET_BUFFER_SPACE(6);
2317 INSERT_JUMP(jump, laststart,
2320 INSERT_JUMP(on_failure_jump,
2324 } else if (zero_times_ok) {
2328 6: /on_failure_jump to 3
2331 GET_BUFFER_SPACE(6);
2332 INSERT_JUMP(jump, laststart,
2335 STORE_JUMP(on_failure_jump,
2342 3: /on_failure_jump to 0
2345 GET_BUFFER_SPACE(3);
2346 STORE_JUMP(on_failure_jump,
2347 buf_end, laststart);
2351 /* Are we optimizing this jump? */
2352 re_bool keep_string_p = false;
2354 if (many_times_ok) {
2355 /* More than one repetition is allowed,
2356 so put in at the end a backward
2357 relative jump from `buf_end' to
2358 before the next jump we're going to
2359 put in below (which jumps from
2360 laststart to after this jump).
2362 But if we are at the `*' in the exact
2363 sequence `.*\n', insert an
2364 unconditional jump backwards to the
2365 ., instead of the beginning of the
2366 loop. This way we only push a
2367 failure point once, instead of every
2368 time through the loop. */
2369 assert(p - 1 > pattern);
2371 /* Allocate the space for the jump. */
2372 GET_BUFFER_SPACE(3);
2374 /* We know we are not at the first
2375 character of the pattern, because
2376 laststart was nonzero. And we've
2377 already incremented `p', by the way,
2378 to be the character after the `*'.
2379 Do we have to do something analogous
2380 here for null bytes, because of
2382 if (*(p - 2) == '.' &&
2386 !(syntax & RE_DOT_NEWLINE)) {
2391 keep_string_p = true;
2393 /* Anything else. */
2399 /* We've added more stuff to the
2404 /* On failure, jump from laststart to buf_end +
2405 3, which will be the end of the buffer after
2406 this jump is inserted. */
2407 GET_BUFFER_SPACE(3);
2408 INSERT_JUMP(keep_string_p ?
2409 on_failure_keep_string_jump
2411 laststart, buf_end + 3);
2414 if (!zero_times_ok) {
2415 /* At least one repetition is required,
2416 so insert a `dummy_failure_jump'
2417 before the initial `on_failure_jump'
2418 instruction of the loop. This effects
2419 a skip over that instruction the
2420 first time we hit that loop. */
2421 GET_BUFFER_SPACE(3);
2422 INSERT_JUMP(dummy_failure_jump,
2433 laststart = buf_end;
2438 /* XEmacs change: this whole section */
2439 re_bool had_char_class = false;
2441 re_bool has_extended_chars = false;
2442 REGISTER Lisp_Object rtab = Qnil;
2450 /* Ensure that we have enough space to push a charset:
2451 the opcode, the length count, and the bitset; 34
2453 GET_BUFFER_SPACE(34);
2455 laststart = buf_end;
2457 /* We test `*p == '^' twice, instead of using an if
2458 statement, so we only need one BUF_PUSH. */
2459 BUF_PUSH(*p == '^' ? charset_not : charset);
2463 /* Remember the first position in the bracket
2467 /* Push the number of bytes in the bitmap. */
2468 BUF_PUSH((1 << BYTEWIDTH) / BYTEWIDTH);
2470 /* Clear the whole map. */
2471 memset(buf_end, 0, (1 << BYTEWIDTH) / BYTEWIDTH);
2473 /* charset_not matches newline according to a syntax
2475 if ((re_opcode_t) buf_end[-2] == charset_not
2476 && (syntax & RE_HAT_LISTS_NOT_NEWLINE))
2480 start_over_with_extended:
2481 if (has_extended_chars) {
2482 /* There are extended chars here, which means we
2483 need to start over and shift to unified
2484 range-table format. */
2485 if (buf_end[-2] == charset)
2486 buf_end[-2] = charset_mule;
2488 buf_end[-2] = charset_mule_not;
2491 /* go back to the beginning of the charset,
2492 after a possible ^. */
2493 rtab = Vthe_lisp_rangetab;
2494 Fclear_range_table(rtab);
2496 /* charset_not matches newline according to a
2498 if ((re_opcode_t) buf_end[-1] ==
2501 RE_HAT_LISTS_NOT_NEWLINE))
2502 SET_EITHER_BIT('\n');
2506 /* Read in characters and ranges, setting map bits. */
2515 if (c >= 0x80 && !has_extended_chars) {
2516 has_extended_chars = 1;
2517 /* Frumble-bumble, we've found some
2518 extended chars. Need to start over,
2519 process everything using the general
2520 extended-char mechanism, and need to
2521 use charset_mule and charset_mule_not
2522 instead of charset and
2524 goto start_over_with_extended;
2527 /* \ might escape characters inside [...] and
2529 if ((syntax & RE_BACKSLASH_ESCAPE_IN_LISTS) &&
2539 && !has_extended_chars) {
2540 has_extended_chars = 1;
2541 goto start_over_with_extended;
2548 /* Could be the end of the bracket
2549 expression. If it's not (i.e., when
2550 the bracket expression is `[]' so
2551 far), the ']' character bit gets set
2553 if (c == ']' && p != p1 + 1)
2556 /* Look ahead to see if it's a range
2557 when the last thing was a character
2559 if (had_char_class && c == '-'
2565 /* Look ahead to see if it's a range
2566 when the last thing was a character:
2567 if this is a hyphen not at the
2568 beginning or the end of a list, then
2569 it's the range operator. */
2571 && !(p - 2 >= pattern
2573 && !(p - 3 >= pattern
2580 if (*(const unsigned char *)p >= 0x80
2581 && !has_extended_chars) {
2582 has_extended_chars = 1;
2583 goto start_over_with_extended;
2585 if (has_extended_chars)
2586 ret = compile_extended_range
2587 (&p, pend, translate,
2592 (&p, pend, translate,
2594 if (ret != REG_NOERROR) {
2600 else if (p[0] == '-' && p[1] != ']') {
2601 /* This handles ranges made up of
2605 /* Move past the `-'. */
2609 if (*(const unsigned char*)p >= 0x80
2610 && !has_extended_chars) {
2611 has_extended_chars = 1;
2612 goto start_over_with_extended;
2614 if (has_extended_chars)
2615 ret = compile_extended_range(
2616 &p, pend, translate,
2620 ret = compile_range(
2621 &p, pend, translate,
2623 if (ret != REG_NOERROR) {
2629 /* See if we're at the beginning of a possible
2632 else if (syntax & RE_CHAR_CLASSES &&
2633 c == '[' && *p == ':') {
2634 /* Leave room for the null. */
2635 char str[CHAR_CLASS_MAX_LENGTH + 1];
2640 /* If pattern is `[[:'. */
2646 /* #### This code is unused.
2647 Correctness is not checked
2648 after TRT table change. */
2650 if (c == ':' || c == ']'
2653 CHAR_CLASS_MAX_LENGTH)
2655 str[c1++] = (char)c;
2659 /* If isn't a word bracketed by
2660 `[:' and `:]': undo the
2661 ending character, the
2662 letters, and leave the
2663 leading `:' and `[' (but set
2665 if (c == ':' && *p == ']') {
2671 if (wct == RECC_ERROR) {
2676 /* Throw away the ] at
2696 re_iswctype(ch, wct)) {
2700 if (re_iswctype(ch, wct)) {
2705 had_char_class = true;
2711 SET_EITHER_BIT('[');
2712 SET_EITHER_BIT(':');
2713 had_char_class = false;
2716 had_char_class = false;
2722 if (has_extended_chars) {
2723 /* We have a range table, not a bit vector. */
2725 unified_range_table_bytes_needed
2727 GET_BUFFER_SPACE(bytes_needed);
2728 unified_range_table_copy_data(rtab,
2731 unified_range_table_bytes_used
2736 /* Discard any (non)matching list bytes that are
2737 all 0 at the end of the map. Decrease the
2738 map-length byte too. */
2739 while ((int)buf_end[-1] > 0
2740 && buf_end[buf_end[-1] - 1] == 0)
2742 buf_end += buf_end[-1];
2747 if (syntax & RE_NO_BK_PARENS)
2753 if (syntax & RE_NO_BK_PARENS)
2759 if (syntax & RE_NEWLINE_ALT)
2765 if (syntax & RE_NO_BK_VBAR)
2771 if (syntax & RE_INTERVALS && syntax & RE_NO_BK_BRACES)
2772 goto handle_interval;
2782 /* Do not translate the character after the \, so that we can
2783 distinguish, e.g., \B from \b, even if we normally would
2784 translate, e.g., B to b. */
2789 if (syntax & RE_NO_BK_PARENS)
2790 goto normal_backslash;
2797 if (!(syntax & RE_NO_SHY_GROUPS)
2798 && p != pend && *p == '?') {
2819 /* Record the translation from capturing group index to
2820 register number, reallocating table as needed. */
2823 external_to_internal_register_size
2828 external_to_internal_register_size;
2830 external_to_internal_register_size
2833 external_to_internal_register,
2835 external_to_internal_register_size,
2841 external_to_internal_register_size;
2844 external_to_internal_register
2851 external_to_internal_register
2856 if (COMPILE_STACK_FULL) {
2857 RETALLOC(compile_stack.stack,
2860 compile_stack_elt_t);
2861 if (compile_stack.stack == NULL)
2864 compile_stack.size <<= 1;
2867 /* These are the values to restore when
2868 we hit end of this group. They are
2869 all relative offsets, so that if the
2870 whole pattern moves because of
2871 realloc, they will still be
2873 COMPILE_STACK_TOP.begalt_offset =
2874 begalt - bufp->buffer;
2875 COMPILE_STACK_TOP.fixup_alt_jump =
2880 COMPILE_STACK_TOP.laststart_offset =
2881 buf_end - bufp->buffer;
2882 COMPILE_STACK_TOP.regnum = r;
2884 /* We will eventually replace the 0 with
2885 the number of groups inner to this
2886 one. But do not push a start_memory
2887 for groups beyond the last one we can
2888 represent in the compiled pattern.
2889 #### bad bad bad. this will fail in
2890 lots of ways, if we ever have to
2891 backtrack for these groups.
2893 if (r <= MAX_REGNUM) {
2895 inner_group_offset =
2896 buf_end - bufp->buffer +
2898 BUF_PUSH_3(start_memory, r, 0);
2901 compile_stack.avail++;
2906 /* If we've reached MAX_REGNUM groups,
2907 then this open won't actually
2908 generate any code, so we'll have to
2909 clear pending_exact explicitly. */
2915 if (syntax & RE_NO_BK_PARENS)
2916 goto normal_backslash;
2918 if (COMPILE_STACK_EMPTY) {
2920 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2921 goto normal_backslash;
2929 if (fixup_alt_jump) {
2930 /* Push a dummy failure point at the end
2931 of the alternative for a possible
2932 future `pop_failure_jump' to pop.
2933 See comments at `push_dummy_failure'
2935 BUF_PUSH(push_dummy_failure);
2937 /* We allocated space for this jump when
2938 we assigned to `fixup_alt_jump', in
2939 the `handle_alt' case below. */
2940 STORE_JUMP(jump_past_alt,
2941 fixup_alt_jump, buf_end - 1);
2944 /* See similar code for backslashed left paren
2946 if (COMPILE_STACK_EMPTY) {
2948 RE_UNMATCHED_RIGHT_PAREN_ORD) {
2956 /* Since we just checked for an empty stack
2957 above, this ``can't happen''. */
2958 assert(compile_stack.avail != 0);
2960 /* We don't just want to restore into
2961 `regnum', because later groups should
2962 continue to be numbered higher, as in
2963 `(ab)c(de)' -- the second group is
2965 regnum_t this_group_regnum;
2967 compile_stack.avail--;
2970 COMPILE_STACK_TOP.begalt_offset;
2983 COMPILE_STACK_TOP.regnum;
2984 /* If we've reached MAX_REGNUM groups,
2985 then this open won't actually
2986 generate any code, so we'll have to
2987 clear pending_exact explicitly. */
2990 /* We're at the end of the group, so now
2991 we know how many groups were inside
2993 if (this_group_regnum <= MAX_REGNUM) {
2994 unsigned char *inner_group_loc
3003 BUF_PUSH_3(stop_memory,
3011 case '|': /* `\|'. */
3012 if (syntax & RE_LIMITED_OPS
3013 || syntax & RE_NO_BK_VBAR) {
3014 goto normal_backslash;
3017 if (syntax & RE_LIMITED_OPS) {
3021 /* Insert before the previous alternative a jump
3022 which jumps to this alternative if the former
3024 GET_BUFFER_SPACE(3);
3025 INSERT_JUMP(on_failure_jump, begalt,
3030 /* The alternative before this one has a jump after it
3031 which gets executed if it gets matched. Adjust that
3032 jump so it will jump to this alternative's analogous
3033 jump (put in below, which in turn will jump to the next
3034 (if any) alternative's such jump, etc.). The last such
3035 jump jumps to the correct final destination. A picture:
3041 If we are at `b', then fixup_alt_jump right now points to a
3042 three-byte space after `a'. We'll put in the jump, set
3043 fixup_alt_jump to right after `b', and leave behind three
3044 bytes which we'll fill in when we get to after `c'. */
3047 STORE_JUMP(jump_past_alt,
3048 fixup_alt_jump, buf_end);
3050 /* Mark and leave space for a jump after this alternative,
3051 to be filled in later either by next alternative or
3052 when know we're at the end of a series of alternatives. */
3053 fixup_alt_jump = buf_end;
3054 GET_BUFFER_SPACE(3);
3062 /* If \{ is a literal. */
3063 if (!(syntax & RE_INTERVALS)
3064 /* If we're at `\{' and it's not the open-interval
3066 || ((syntax & RE_INTERVALS)
3067 && (syntax & RE_NO_BK_BRACES))
3068 || (p - 2 == pattern && p == pend))
3069 goto normal_backslash;
3073 /* If got here, then the syntax allows
3076 /* At least (most) this many matches
3078 int lower_bound = -1, upper_bound = -1;
3080 beg_interval = p - 1;
3083 if (syntax & RE_NO_BK_BRACES) {
3084 goto unfetch_interval;
3091 GET_UNSIGNED_NUMBER(lower_bound);
3094 GET_UNSIGNED_NUMBER(
3096 if (upper_bound < 0) {
3101 /* Interval such as `{1}' =>
3102 match exactly once. */
3103 upper_bound = lower_bound;
3107 || upper_bound > RE_DUP_MAX
3108 || lower_bound > upper_bound) {
3109 if (syntax & RE_NO_BK_BRACES) {
3110 goto unfetch_interval;
3117 if (!(syntax & RE_NO_BK_BRACES)) {
3126 if (syntax & RE_NO_BK_BRACES) {
3127 goto unfetch_interval;
3134 /* We just parsed a valid interval. */
3136 /* If it's invalid to have no preceding
3140 RE_CONTEXT_INVALID_OPS) {
3145 RE_CONTEXT_INDEP_OPS) {
3146 laststart = buf_end;
3148 goto unfetch_interval;
3152 /* If the upper bound is zero, don't
3153 want to succeed at all; jump from
3154 `laststart' to `b + 3', which will be
3155 the end of the buffer after we insert
3157 if (upper_bound == 0) {
3158 GET_BUFFER_SPACE(3);
3159 INSERT_JUMP(jump, laststart,
3164 /* Otherwise, we have a nontrivial interval. When
3165 we're all done, the pattern will look like:
3166 set_number_at <jump count> <upper bound>
3167 set_number_at <succeed_n count> <lower bound>
3168 succeed_n <after jump addr> <succeed_n count>
3170 jump_n <succeed_n addr> <jump count>
3171 (The upper bound and `jump_n' are omitted if
3172 `upper_bound' is 1, though.) */
3173 else { /* If the upper bound is > 1, we need to insert
3174 more at the end of the loop. */
3175 Memory_count nbytes =
3176 10 + (upper_bound > 1) * 10;
3178 GET_BUFFER_SPACE(nbytes);
3180 /* Initialize lower bound of the `succeed_n', even
3181 though it will be set during matching by its
3182 attendant `set_number_at' (inserted next),
3183 because `re_compile_fastmap' needs to know.
3184 Jump to the `jump_n' we might insert below. */
3185 INSERT_JUMP2(succeed_n,
3193 /* Code to initialize the lower bound. Insert
3194 before the `succeed_n'. The `5' is the last two
3195 bytes of this `set_number_at', plus 3 bytes of
3196 the following `succeed_n'. */
3197 insert_op2(set_number_at,
3203 if (upper_bound > 1) { /* More than one repetition is allowed, so
3204 append a backward jump to the `succeed_n'
3205 that starts this interval.
3207 When we've reached this during matching,
3208 we'll have matched the interval once, so
3209 jump back only `upper_bound - 1' times. */
3218 /* The location we want to set is the second
3219 parameter of the `jump_n'; that is `b-2' as
3220 an absolute address. `laststart' will be
3221 the `set_number_at' we're about to insert;
3222 `laststart+3' the number to set, the source
3223 for the relative address. But we are
3224 inserting into the middle of the pattern --
3225 so everything is getting moved up by 5.
3226 Conclusion: (b - 2) - (laststart + 3) + 5,
3227 i.e., b - laststart.
3229 We insert this at the beginning of the loop
3230 so that if we fail during matching, we'll
3231 reinitialize the bounds. */
3243 beg_interval = NULL;
3248 /* If an invalid interval, match the characters as literals. */
3249 assert(beg_interval);
3251 beg_interval = NULL;
3253 /* normal_char and normal_backslash need `c'. */
3256 if (!(syntax & RE_NO_BK_BRACES)) {
3257 if (p > pattern && p[-1] == '\\')
3258 goto normal_backslash;
3263 /* There is no way to specify the before_dot and after_dot
3264 operators. rms says this is ok. --karl */
3270 laststart = buf_end;
3272 /* XEmacs addition */
3273 if (c >= 0x80 || syntax_spec_code[c] == 0377) {
3277 BUF_PUSH_2(syntaxspec, syntax_spec_code[c]);
3281 laststart = buf_end;
3283 /* XEmacs addition */
3284 if (c >= 0x80 || syntax_spec_code[c] == 0377) {
3288 BUF_PUSH_2(notsyntaxspec, syntax_spec_code[c]);
3292 /* 97.2.17 jhod merged in to XEmacs from mule-2.3 */
3294 laststart = buf_end;
3296 if (c < 32 || c > 127) {
3298 return REG_ECATEGORY;
3300 BUF_PUSH_2(categoryspec, c);
3304 laststart = buf_end;
3306 if (c < 32 || c > 127) {
3308 return REG_ECATEGORY;
3310 BUF_PUSH_2(notcategoryspec, c);
3312 /* end of category patch */
3317 laststart = buf_end;
3322 laststart = buf_end;
3323 BUF_PUSH(notwordchar);
3335 BUF_PUSH(wordbound);
3339 BUF_PUSH(notwordbound);
3361 if (syntax & RE_NO_BK_REFS) {
3364 /* External register indexing. */
3367 if (reg > bufp->re_nsub) {
3372 /* Convert external to internal as soon
3374 reg = bufp->external_to_internal_register[reg];
3376 /* Can't back reference to a
3377 subexpression if inside it. */
3378 if (group_in_compile_stack(
3379 compile_stack, reg)) {
3382 laststart = buf_end;
3383 BUF_PUSH_2(duplicate, reg);
3389 if (syntax & RE_BK_PLUS_QM) {
3392 goto normal_backslash;
3396 /* You might think it would be useful for \ to
3397 mean not to translate; but if we don't
3398 translate it, it will never match
3405 /* Expects the character in `c'. */
3406 /* `p' points to the location after where `c' came
3410 /* XEmacs: modifications here for Mule. */
3411 /* `q' points to the beginning of the next char. */
3414 /* If no exactn currently being built. */
3416 /* If last exactn not at current position. */
3417 || pending_exact + *pending_exact + 1 !=
3419 /* We have only one byte following the exactn for
3421 || ((unsigned int)(*pending_exact + (q - p)) >=
3422 ((unsigned int)(1 << BYTEWIDTH) - 1))
3424 /* If followed by a repetition operator. */
3425 || *q == '*' || *q == '^'
3426 || ((syntax & RE_BK_PLUS_QM)
3427 ? *q == '\\' && (q[1] == '+'
3429 : (*q == '+' || *q == '?'))
3430 || ((syntax & RE_INTERVALS)
3431 && ((syntax & RE_NO_BK_BRACES)
3433 : (q[0] == '\\' && q[1] == '{')))) {
3434 /* Start building a new exactn. */
3436 laststart = buf_end;
3438 BUF_PUSH_2(exactn, 0);
3439 pending_exact = buf_end - 1;
3447 Bufbyte tmp_buf[MAX_EMCHAR_LEN];
3451 set_charptr_emchar(tmp_buf, c);
3453 for (i = 0; i < bt_count; i++) {
3454 BUF_PUSH(tmp_buf[i]);
3462 } /* while p != pend */
3464 /* Through the pattern now. */
3466 if (fixup_alt_jump) {
3467 STORE_JUMP(jump_past_alt, fixup_alt_jump, buf_end);
3469 if (!COMPILE_STACK_EMPTY) {
3473 /* If we don't want backtracking, force success
3474 the first time we reach the end of the compiled pattern. */
3475 if (syntax & RE_NO_POSIX_BACKTRACKING) {
3478 xfree(compile_stack.stack);
3480 /* We have succeeded; set the length of the buffer. */
3481 bufp->used = buf_end - bufp->buffer;
3483 #ifdef REGEX_DEBUG_FLAG
3485 DEBUG_PRINT1("\nCompiled pattern: \n");
3486 print_compiled_pattern(bufp);
3490 #ifndef MATCH_MAY_ALLOCATE
3491 /* Initialize the failure stack to the largest possible stack. This
3492 isn't necessary unless we're trying to avoid calling alloca in
3493 the search and match routines. */
3495 int num_regs = bufp->re_ngroups + 1;
3497 /* Since DOUBLE_FAIL_STACK refuses to double only if the current size
3498 is strictly greater than re_max_failures, the largest possible stack
3499 is 2 * re_max_failures failure points. */
3500 if (fail_stack.size < (2 * re_max_failures * MAX_FAILURE_ITEMS)) {
3502 (2 * re_max_failures * MAX_FAILURE_ITEMS);
3504 if (!fail_stack.stack) {
3506 (fail_stack_elt_t *)
3509 sizeof(fail_stack_elt_t));
3512 (fail_stack_elt_t *)
3513 xrealloc(fail_stack.stack,
3515 sizeof(fail_stack_elt_t)));
3519 regex_grow_registers(num_regs);
3521 #endif /* not MATCH_MAY_ALLOCATE */
3527 /* Subroutines for `regex_compile'. */
3529 /* Store OP at LOC followed by two-byte integer parameter ARG. */
3531 static void store_op1(re_opcode_t op, unsigned char *loc, int arg)
3533 *loc = (unsigned char)op;
3534 STORE_NUMBER(loc + 1, arg);
3537 /* Like `store_op1', but for two two-byte parameters ARG1 and ARG2. */
3539 static void store_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2)
3541 *loc = (unsigned char)op;
3542 STORE_NUMBER(loc + 1, arg1);
3543 STORE_NUMBER(loc + 3, arg2);
3546 /* Copy the bytes from LOC to END to open up three bytes of space at LOC
3547 for OP followed by two-byte integer parameter ARG. */
3550 insert_op1(re_opcode_t op, unsigned char *loc, int arg, unsigned char *end)
3552 REGISTER unsigned char *pfrom = end;
3553 REGISTER unsigned char *pto = end + 3;
3555 while (pfrom != loc)
3558 store_op1(op, loc, arg);
3561 /* Like `insert_op1', but for two two-byte parameters ARG1 and ARG2. */
3564 insert_op2(re_opcode_t op, unsigned char *loc, int arg1, int arg2,
3567 REGISTER unsigned char *pfrom = end;
3568 REGISTER unsigned char *pto = end + 5;
3570 while (pfrom != loc)
3573 store_op2(op, loc, arg1, arg2);
3576 /* P points to just after a ^ in PATTERN. Return true if that ^ comes
3577 after an alternative or a begin-subexpression. We assume there is at
3578 least one character before the ^. */
3581 at_begline_loc_p(re_char *pattern, re_char *p, reg_syntax_t syntax)
3583 re_char *prev = p - 2;
3584 re_bool prev_prev_backslash = prev > pattern && prev[-1] == '\\';
3587 /* After a subexpression? */
3588 (*prev == '(' && (syntax & RE_NO_BK_PARENS || prev_prev_backslash))
3589 /* After an alternative? */
3591 && (syntax & RE_NO_BK_VBAR || prev_prev_backslash));
3594 /* The dual of at_begline_loc_p. This one is for $. We assume there is
3595 at least one character after the $, i.e., `P < PEND'. */
3598 at_endline_loc_p(re_char *p, re_char *pend, int syntax)
3601 re_bool next_backslash = *next == '\\';
3602 re_char *next_next = p + 1 < pend ? p + 1 : 0;
3605 /* Before a subexpression? */
3606 (syntax & RE_NO_BK_PARENS ? *next == ')'
3607 : next_backslash && next_next && *next_next == ')')
3608 /* Before an alternative? */
3609 || (syntax & RE_NO_BK_VBAR ? *next == '|'
3610 : next_backslash && next_next && *next_next == '|');
3613 /* Returns true if REGNUM is in one of COMPILE_STACK's elements and
3614 false if it's not. */
3617 group_in_compile_stack(compile_stack_type compile_stack, regnum_t regnum)
3621 for (this_element = compile_stack.avail - 1;
3622 this_element >= 0; this_element--)
3623 if (compile_stack.stack[this_element].regnum == regnum) {
3629 /* Read the ending character of a range (in a bracket expression) from the
3630 uncompiled pattern *P_PTR (which ends at PEND). We assume the
3631 starting character is in `P[-2]'. (`P[-1]' is the character `-'.)
3632 Then we set the translation of all bits between the starting and
3633 ending characters (inclusive) in the compiled pattern B.
3635 Return an error code.
3637 We use these short variable names so we can use the same macros as
3638 `regex_compile' itself. */
3640 static reg_errcode_t
3641 compile_range(re_char ** p_ptr, re_char * pend, RE_TRANSLATE_TYPE translate,
3642 reg_syntax_t syntax, unsigned char *buf_end)
3644 Element_count this_char;
3646 re_char *p = *p_ptr;
3647 int range_start, range_end;
3652 /* Even though the pattern is a signed `char *', we need to fetch
3653 with unsigned char *'s; if the high bit of the pattern character
3654 is set, the range endpoints will be negative if we fetch using a
3657 We also want to fetch the endpoints without translating them; the
3658 appropriate translation is done in the bit-setting loop below. */
3659 /* The SVR4 compiler on the 3B2 had trouble with unsigned const char *. */
3660 range_start = ((const unsigned char *)p)[-2];
3661 range_end = ((const unsigned char *)p)[0];
3663 /* Have to increment the pointer into the pattern string, so the
3664 caller isn't still at the ending character. */
3667 /* If the start is after the end, the range is empty. */
3668 if (range_start > range_end)
3669 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3671 /* Here we see why `this_char' has to be larger than an `unsigned
3672 char' -- the range is inclusive, so if `range_end' == 0xff
3673 (assuming 8-bit characters), we would otherwise go into an infinite
3674 loop, since all characters <= 0xff. */
3675 for (this_char = range_start; this_char <= range_end; this_char++) {
3676 SET_LIST_BIT(TRANSLATE(this_char));
3684 static reg_errcode_t
3685 compile_extended_range(re_char ** p_ptr, re_char * pend,
3686 RE_TRANSLATE_TYPE translate,
3687 reg_syntax_t syntax, Lisp_Object rtab)
3689 Emchar this_char, range_start, range_end;
3695 p = (const Bufbyte *)*p_ptr;
3696 range_end = charptr_emchar(p);
3697 p--; /* back to '-' */
3698 DEC_CHARPTR(p); /* back to start of range */
3699 /* We also want to fetch the endpoints without translating them; the
3700 appropriate translation is done in the bit-setting loop below. */
3701 range_start = charptr_emchar(p);
3702 INC_CHARPTR(*p_ptr);
3704 /* If the start is after the end, the range is empty. */
3705 if (range_start > range_end)
3706 return syntax & RE_NO_EMPTY_RANGES ? REG_ERANGE : REG_NOERROR;
3708 /* Can't have ranges spanning different charsets, except maybe for
3709 ranges entirely within the first 256 chars. */
3711 if ((range_start >= 0x100 || range_end >= 0x100)
3712 && CHAR_LEADING_BYTE(range_start) != CHAR_LEADING_BYTE(range_end))
3713 return REG_ERANGESPAN;
3715 /* As advertised, translations only work over the 0 - 0x7F range.
3716 Making this kind of stuff work generally is much harder.
3717 Iterating over the whole range like this would be way efficient
3718 if the range encompasses 10,000 chars or something. You'd have
3719 to do something like this:
3723 map over translation table in [range_start, range_end] of
3724 (put the mapped range in a;
3725 put the translation in b)
3726 invert the range in a and truncate to [range_start, range_end]
3727 compute the union of a, b
3728 union the result into rtab
3730 for (this_char = range_start;
3731 this_char <= range_end && this_char < 0x80; this_char++) {
3732 SET_RANGETAB_BIT(TRANSLATE(this_char));
3735 if (this_char <= range_end)
3736 put_range_table(rtab, this_char, range_end, Qt);
3743 /* re_compile_fastmap computes a ``fastmap'' for the compiled pattern in
3744 BUFP. A fastmap records which of the (1 << BYTEWIDTH) possible
3745 characters can start a string that matches the pattern. This fastmap
3746 is used by re_search to skip quickly over impossible starting points.
3748 The caller must supply the address of a (1 << BYTEWIDTH)-byte data
3749 area as BUFP->fastmap.
3751 We set the `fastmap', `fastmap_accurate', and `can_be_null' fields in
3754 Returns 0 if we succeed, -2 if an internal error. */
3756 int re_compile_fastmap(struct re_pattern_buffer *bufp)
3759 #ifdef MATCH_MAY_ALLOCATE
3760 fail_stack_type fail_stack;
3762 DECLARE_DESTINATION;
3763 /* We don't push any register information onto the failure stack. */
3765 REGISTER char *fastmap = bufp->fastmap;
3766 unsigned char *pattern = bufp->buffer;
3767 unsigned long size = bufp->used;
3768 /* actually p supposed to carry the const qualifier, however, some
3769 silly mule code below CHANGES p and hence we cant go with the const
3770 qualifier here, leaving an `unfixable' warning on the way */
3772 unsigned char *p = pattern;
3774 re_char *p = pattern;
3776 REGISTER unsigned char *pend = pattern + size;
3779 /* This holds the pointer to the failure stack, when
3780 it is allocated relocatably. */
3781 fail_stack_elt_t *failure_stack_ptr;
3784 /* Assume that each path through the pattern can be null until
3785 proven otherwise. We set this false at the bottom of switch
3786 statement, to which we get only if a particular path doesn't
3787 match the empty string. */
3788 re_bool path_can_be_null = true;
3790 /* We aren't doing a `succeed_n' to begin with. */
3791 re_bool succeed_n_p = false;
3793 assert(fastmap != NULL && p != NULL);
3796 memset(fastmap, 0, 1 << BYTEWIDTH); /* Assume nothing's valid. */
3797 bufp->fastmap_accurate = 1; /* It will be when we're done. */
3798 bufp->can_be_null = 0;
3801 if (p == pend || *p == succeed) {
3802 /* We have reached the (effective) end of pattern. */
3803 if (!FAIL_STACK_EMPTY()) {
3804 bufp->can_be_null |= path_can_be_null;
3806 /* Reset for next path. */
3807 path_can_be_null = true;
3809 /* fuck, p isnt const thanks to that
3810 * unified range table function below */
3812 p = (unsigned char*)fail_stack.
3813 stack[--fail_stack.avail].pointer;
3815 p = fail_stack.stack[--fail_stack.avail]
3825 /* We should never be about to go beyond the end of the pattern. */
3828 switch (SWITCH_ENUM_CAST((re_opcode_t) * p++)) {
3830 /* I guess the idea here is to simply not bother with a
3831 fastmap if a backreference is used, since it's too
3832 hard to figure out the fastmap for the corresponding
3833 group. Setting `can_be_null' stops `re_search_2'
3834 from using the fastmap, so that is all we do. */
3836 bufp->can_be_null = 1;
3839 /* Following are the cases which match a character.
3840 These end with `break'. */
3847 /* XEmacs: Under Mule, these bit vectors will
3848 only contain values for characters below 0x80. */
3849 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3850 if (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH)))
3855 /* Chars beyond end of map must be allowed. */
3857 for (j = *p * BYTEWIDTH; j < 0x80; j++)
3859 /* And all extended characters must be allowed, too. */
3860 for (j = 0x80; j < 0xA0; j++)
3862 #else /* not MULE */
3863 for (j = *p * BYTEWIDTH; j < (1 << BYTEWIDTH); j++)
3867 for (j = *p++ * BYTEWIDTH - 1; j >= 0; j--)
3869 (p[j / BYTEWIDTH] & (1 << (j % BYTEWIDTH))))
3874 case charset_mule: {
3878 nentries = unified_range_table_nentries(p);
3879 for (i = 0; i < nentries; i++) {
3880 EMACS_INT first, last;
3881 Lisp_Object dummy_val;
3883 Bufbyte strr[MAX_EMCHAR_LEN];
3885 unified_range_table_get_range(p, i,
3890 jj <= last && jj < 0x80; jj++)
3892 /* Ranges below 0x100 can span charsets, but there
3893 are only two (Control-1 and Latin-1), and
3894 either first or last has to be in them. */
3895 set_charptr_emchar(strr, first);
3898 set_charptr_emchar(strr, last);
3905 case charset_mule_not: {
3909 nentries = unified_range_table_nentries(p);
3910 for (i = 0; i < nentries; i++) {
3911 EMACS_INT first, last;
3912 Lisp_Object dummy_val;
3914 int smallest_prev = 0;
3916 unified_range_table_get_range(p, i,
3920 for (jj = smallest_prev;
3921 jj < first && jj < 0x80; jj++)
3923 smallest_prev = last + 1;
3924 if (smallest_prev >= 0x80)
3927 /* Calculating which leading bytes are actually allowed
3928 here is rather difficult, so we just punt and allow
3930 for (i = 0x80; i < 0xA0; i++)
3941 for (j = 0; j < (1 << BYTEWIDTH); j++)
3944 (regex_emacs_buffer->mirror_syntax_table),
3953 goto matchnotsyntax;
3955 for (j = 0; j < (1 << BYTEWIDTH); j++)
3958 (regex_emacs_buffer->mirror_syntax_table),
3965 int fastmap_newline = fastmap['\n'];
3967 /* `.' matches anything ... */
3969 /* "anything" only includes bytes that can be the
3970 first byte of a character. */
3971 for (j = 0; j < 0xA0; j++)
3974 for (j = 0; j < (1 << BYTEWIDTH); j++)
3978 /* ... except perhaps newline. */
3979 if (!(bufp->syntax & RE_DOT_NEWLINE))
3980 fastmap['\n'] = fastmap_newline;
3982 /* Return if we have already set `can_be_null'; if we
3983 have, then the fastmap is irrelevant. Something's
3985 else if (bufp->can_be_null)
3988 /* Otherwise, have to check alternative paths. */
3999 /* This match depends on text properties. These end with
4000 aborting optimizations. */
4001 bufp->can_be_null = 1;
4007 for (j = 0; j < 0x80; j++)
4010 (regex_emacs_buffer->mirror_syntax_table),
4011 j) == (enum syntaxcode)k)
4013 for (j = 0x80; j < 0xA0; j++) {
4014 if (LEADING_BYTE_PREFIX_P(j))
4015 /* too complicated to calculate this right */
4021 cset = CHARSET_BY_LEADING_BYTE(j);
4022 if (CHARSETP(cset)) {
4024 (regex_emacs_buffer, cset,
4026 == Sword || multi_p)
4031 #else /* not MULE */
4032 for (j = 0; j < (1 << BYTEWIDTH); j++)
4035 (regex_emacs_buffer->mirror_syntax_table),
4036 j) == (enum syntaxcode)k)
4043 for (j = 0; j < 0x80; j++)
4046 (regex_emacs_buffer->mirror_syntax_table),
4047 j) != (enum syntaxcode)k)
4049 for (j = 0x80; j < 0xA0; j++) {
4050 if (LEADING_BYTE_PREFIX_P(j))
4051 /* too complicated to calculate this right */
4057 cset = CHARSET_BY_LEADING_BYTE(j);
4058 if (CHARSETP(cset)) {
4060 (regex_emacs_buffer, cset,
4062 != Sword || multi_p)
4067 #else /* not MULE */
4068 for (j = 0; j < (1 << BYTEWIDTH); j++)
4071 (regex_emacs_buffer->mirror_syntax_table),
4072 j) != (enum syntaxcode)k)
4079 /* 97/2/17 jhod category patch */
4081 case notcategoryspec:
4082 bufp->can_be_null = 1;
4084 /* end if category patch */
4087 /* All cases after this match the empty string. These
4088 end with `continue'. */
4107 case push_dummy_failure:
4111 case pop_failure_jump:
4112 case maybe_pop_jump:
4115 case dummy_failure_jump:
4116 EXTRACT_NUMBER_AND_INCR(j, p);
4121 /* Jump backward implies we just went through the body
4122 of a loop and matched nothing. Opcode jumped to
4123 should be `on_failure_jump' or `succeed_n'. Just
4124 treat it like an ordinary jump. For a * loop, it has
4125 pushed its failure point already; if so, discard that
4127 if ((re_opcode_t) * p != on_failure_jump
4128 && (re_opcode_t) * p != succeed_n)
4132 EXTRACT_NUMBER_AND_INCR(j, p);
4135 /* If what's on the stack is where we are now, pop
4137 if (!FAIL_STACK_EMPTY()
4138 && fail_stack.stack[fail_stack.avail - 1].pointer ==
4144 case on_failure_jump:
4145 case on_failure_keep_string_jump:
4146 handle_on_failure_jump:
4147 EXTRACT_NUMBER_AND_INCR(j, p);
4149 /* For some patterns, e.g., `(a?)?', `p+j' here points
4150 to the end of the pattern. We don't want to push
4151 such a point, since when we restore it above,
4152 entering the switch will increment `p' past the end
4153 of the pattern. We don't need to push such a point
4154 since we obviously won't find any more fastmap
4155 entries beyond `pend'. Such a pattern can match the
4156 null string, though. */
4158 if (!PUSH_PATTERN_OP(p + j, fail_stack)) {
4163 bufp->can_be_null = 1;
4166 EXTRACT_NUMBER_AND_INCR(k, p); /* Skip the n. */
4167 succeed_n_p = false;
4173 /* Get to the number of times to succeed. */
4176 /* Increment p past the n for when k != 0. */
4177 EXTRACT_NUMBER_AND_INCR(k, p);
4180 succeed_n_p = true; /* Spaghetti code alert. */
4181 goto handle_on_failure_jump;
4196 abort(); /* We have listed all the cases. */
4199 /* Getting here means we have found the possible starting
4200 characters for one path of the pattern -- and that the empty
4201 string does not match. We need not follow this path further.
4202 Instead, look at the next alternative (remembered on the
4203 stack), or quit if no more. The test at the top of the loop
4204 does these things. */
4205 path_can_be_null = false;
4209 /* Set `can_be_null' for the last path (also the first path, if the
4210 pattern is empty). */
4211 bufp->can_be_null |= path_can_be_null;
4216 } /* re_compile_fastmap */
4218 /* Set REGS to hold NUM_REGS registers, storing them in STARTS and
4219 ENDS. Subsequent matches using PATTERN_BUFFER and REGS will use
4220 this memory for recording register information. STARTS and ENDS
4221 must be allocated using the malloc library routine, and must each
4222 be at least NUM_REGS * sizeof (regoff_t) bytes long.
4224 If NUM_REGS == 0, then subsequent matches should allocate their own
4227 Unless this function is called, the first search or match using
4228 PATTERN_BUFFER will allocate its own register data, without
4229 freeing the old data. */
4232 re_set_registers(struct re_pattern_buffer *bufp, struct re_registers *regs,
4233 unsigned num_regs, regoff_t * starts, regoff_t * ends)
4236 bufp->regs_allocated = REGS_REALLOCATE;
4237 regs->num_regs = num_regs;
4238 regs->start = starts;
4241 bufp->regs_allocated = REGS_UNALLOCATED;
4243 regs->start = regs->end = (regoff_t *) 0;
4247 /* Searching routines. */
4249 /* Like re_search_2, below, but only one string is specified, and
4250 doesn't let you say where to stop matching. */
4253 re_search(struct re_pattern_buffer *bufp, const char *string, int size,
4254 int startpos, int range, struct re_registers *regs)
4256 return re_search_2(bufp, NULL, 0, string, size, startpos, range,
4261 /* Snarfed from src/lisp.h, needed for compiling [ce]tags. */
4262 # define bytecount_to_charcount(ptr, len) (len)
4263 # define charcount_to_bytecount(ptr, len) (len)
4264 typedef int Charcount;
4267 /* Using the compiled pattern in BUFP->buffer, first tries to match the
4268 virtual concatenation of STRING1 and STRING2, starting first at index
4269 STARTPOS, then at STARTPOS + 1, and so on.
4271 With MULE, STARTPOS is a byte position, not a char position. And the
4272 search will increment STARTPOS by the width of the current leading
4275 STRING1 and STRING2 have length SIZE1 and SIZE2, respectively.
4277 RANGE is how far to scan while trying to match. RANGE = 0 means try
4278 only at STARTPOS; in general, the last start tried is STARTPOS +
4281 With MULE, RANGE is a byte position, not a char position. The last
4282 start tried is the character starting <= STARTPOS + RANGE.
4284 In REGS, return the indices of the virtual concatenation of STRING1
4285 and STRING2 that matched the entire BUFP->buffer and its contained
4288 Do not consider matching one past the index STOP in the virtual
4289 concatenation of STRING1 and STRING2.
4291 We return either the position in the strings at which the match was
4292 found, -1 if no match, or -2 if error (such as failure
4296 re_search_2(struct re_pattern_buffer *bufp, const char *str1,
4297 int size1, const char *str2, int size2, int startpos,
4298 int range, struct re_registers *regs, int stop)
4301 re_char *string1 = (re_char *) str1;
4302 re_char *string2 = (re_char *) str2;
4303 REGISTER char *fastmap = bufp->fastmap;
4304 REGISTER RE_TRANSLATE_TYPE translate = bufp->translate;
4305 int total_size = size1 + size2;
4306 int endpos = startpos + range;
4307 #ifdef REGEX_BEGLINE_CHECK
4308 int anchored_at_begline = 0;
4313 /* Check for out-of-range STARTPOS. */
4314 if (startpos < 0 || startpos > total_size)
4317 /* Fix up RANGE if it might eventually take us outside
4318 the virtual concatenation of STRING1 and STRING2. */
4320 range = 0 - startpos;
4321 else if (endpos > total_size)
4322 range = total_size - startpos;
4324 /* If the search isn't to be a backwards one, don't waste time in a
4325 search for a pattern that must be anchored. */
4326 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == begbuf
4331 d = ((const unsigned char *)
4333 size1 ? string2 - size1 : string1) + startpos);
4334 range = charcount_to_bytecount(d, 1);
4338 /* In a forward search for something that starts with \=.
4339 don't keep searching past point. */
4340 if (bufp->used > 0 && (re_opcode_t) bufp->buffer[0] == at_dot
4343 BUF_PT(regex_emacs_buffer) - BUF_BEGV(regex_emacs_buffer)
4350 /* Update the fastmap now if not correct already. */
4351 if (fastmap && !bufp->fastmap_accurate)
4352 if (re_compile_fastmap(bufp) == -2)
4355 #ifdef REGEX_BEGLINE_CHECK
4357 unsigned long i = 0;
4359 while (i < bufp->used) {
4360 if (bufp->buffer[i] == start_memory ||
4361 bufp->buffer[i] == stop_memory)
4366 anchored_at_begline = i < bufp->used
4367 && bufp->buffer[i] == begline;
4372 SETUP_SYNTAX_CACHE_FOR_OBJECT(regex_match_object,
4374 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR
4375 (regex_match_object, regex_emacs_buffer,
4379 /* Loop through the string, looking for a place to start matching. */
4381 #ifdef REGEX_BEGLINE_CHECK
4382 /* If the regex is anchored at the beginning of a line (i.e. with a ^),
4383 then we can speed things up by skipping to the next beginning-of-
4385 if (anchored_at_begline && startpos > 0 && startpos != size1 &&
4387 /* whose stupid idea was it anyway to make this
4388 function take two strings to match?? */
4392 if (startpos < size1 && startpos + range >= size1)
4393 lim = range - (size1 - startpos);
4395 d = ((const unsigned char *)
4397 size1 ? string2 - size1 : string1) + startpos);
4398 DEC_CHARPTR(d); /* Ok, since startpos != size1. */
4399 d_size = charcount_to_bytecount(d, 1);
4401 if (TRANSLATE_P(translate))
4402 while (range > lim && *d != '\n') {
4403 d += d_size; /* Speedier INC_CHARPTR(d) */
4404 d_size = charcount_to_bytecount(d, 1);
4407 while (range > lim && *d != '\n') {
4408 d += d_size; /* Speedier INC_CHARPTR(d) */
4409 d_size = charcount_to_bytecount(d, 1);
4413 startpos += irange - range;
4415 #endif /* REGEX_BEGLINE_CHECK */
4417 /* If a fastmap is supplied, skip quickly over characters that
4418 cannot be the start of a match. If the pattern can match the
4419 null string, however, we don't need to skip characters; we want
4420 the first null string. */
4421 if (fastmap && startpos < total_size && !bufp->can_be_null) {
4422 if (range > 0) { /* Searching forwards. */
4426 if (startpos < size1
4427 && startpos + range >= size1)
4428 lim = range - (size1 - startpos);
4430 d = ((const unsigned char *)
4432 size1 ? string2 - size1 : string1) +
4435 /* Written out as an if-else to avoid testing `translate'
4437 if (TRANSLATE_P(translate))
4438 while (range > lim) {
4441 Bufbyte str[MAX_EMCHAR_LEN];
4443 buf_ch = charptr_emchar(d);
4444 buf_ch = RE_TRANSLATE(buf_ch);
4445 set_charptr_emchar(str, buf_ch);
4447 || fastmap[(unsigned char)
4457 charcount_to_bytecount(d,
4460 d += d_size; /* Speedier INC_CHARPTR(d) */
4462 while (range > lim && !fastmap[*d]) {
4464 charcount_to_bytecount(d,
4467 d += d_size; /* Speedier INC_CHARPTR(d) */
4470 startpos += irange - range;
4471 } else { /* Searching backwards. */
4473 Emchar c = (size1 == 0 || startpos >= size1
4474 ? charptr_emchar(string2 +
4476 : charptr_emchar(string1 +
4480 if (!(c >= 0200 || fastmap[(unsigned char)c]))
4483 if (!fastmap[(unsigned char)c])
4489 /* If can't match the null string, and that's all we have left, fail. */
4490 if (range >= 0 && startpos == total_size && fastmap
4491 && !bufp->can_be_null)
4494 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4495 if (!no_quit_in_re_search)
4498 val = re_match_2_internal(bufp, string1, size1, string2, size2,
4499 startpos, regs, stop);
4500 #ifndef REGEX_MALLOC
4515 else if (range > 0) {
4516 d = ((const unsigned char *)
4518 size1 ? string2 - size1 : string1) + startpos);
4519 d_size = charcount_to_bytecount(d, 1);
4523 /* Note startpos > size1 not >=. If we are on the
4524 string1/string2 boundary, we want to backup into string1. */
4525 d = ((const unsigned char *)
4527 size1 ? string2 - size1 : string1) + startpos);
4529 d_size = charcount_to_bytecount(d, 1);
4537 /* Declarations and macros for re_match_2. */
4539 /* This converts PTR, a pointer into one of the search strings `string1'
4540 and `string2' into an offset from the beginning of that string. */
4541 #define POINTER_TO_OFFSET(ptr) \
4542 (FIRST_STRING_P (ptr) \
4543 ? ((regoff_t) ((ptr) - string1)) \
4544 : ((regoff_t) ((ptr) - string2 + size1)))
4546 /* Macros for dealing with the split strings in re_match_2. */
4548 #define MATCHING_IN_FIRST_STRING (dend == end_match_1)
4550 /* Call before fetching a character with *d. This switches over to
4551 string2 if necessary. */
4552 #define REGEX_PREFETCH() \
4555 /* End of string2 => fail. */ \
4556 if (dend == end_match_2) \
4558 /* End of string1 => advance to string2. */ \
4560 dend = end_match_2; \
4563 /* Test if at very beginning or at very end of the virtual concatenation
4564 of `string1' and `string2'. If only one string, it's `string2'. */
4565 #define AT_STRINGS_BEG(d) ((d) == (size1 ? string1 : string2) || !size2)
4566 #define AT_STRINGS_END(d) ((d) == end2)
4569 If the given position straddles the string gap, return the equivalent
4570 position that is before or after the gap, respectively; otherwise,
4571 return the same position. */
4572 #define POS_BEFORE_GAP_UNSAFE(d) ((d) == string2 ? end1 : (d))
4573 #define POS_AFTER_GAP_UNSAFE(d) ((d) == end1 ? string2 : (d))
4575 /* Test if CH is a word-constituent character. (XEmacs change) */
4576 #define WORDCHAR_P_UNSAFE(ch) \
4577 (SYNTAX_UNSAFE (XCHAR_TABLE (regex_emacs_buffer->mirror_syntax_table), \
4580 /* Free everything we malloc. */
4581 #ifdef MATCH_MAY_ALLOCATE
4582 #define FREE_VAR(var) if (var) REGEX_FREE (var); var = NULL
4583 #define FREE_VARIABLES() \
4585 REGEX_FREE_STACK (fail_stack.stack); \
4586 FREE_VAR (regstart); \
4587 FREE_VAR (regend); \
4588 FREE_VAR (old_regstart); \
4589 FREE_VAR (old_regend); \
4590 FREE_VAR (best_regstart); \
4591 FREE_VAR (best_regend); \
4592 FREE_VAR (reg_info); \
4593 FREE_VAR (reg_dummy); \
4594 FREE_VAR (reg_info_dummy); \
4596 #else /* not MATCH_MAY_ALLOCATE */
4597 #define FREE_VARIABLES() ((void)0) /* Do nothing! But inhibit gcc warning. */
4598 #endif /* MATCH_MAY_ALLOCATE */
4600 /* These values must meet several constraints. They must not be valid
4601 register values; since we have a limit of 255 registers (because
4602 we use only one byte in the pattern for the register number), we can
4603 use numbers larger than 255. They must differ by 1, because of
4604 NUM_FAILURE_ITEMS above. And the value for the lowest register must
4605 be larger than the value for the highest register, so we do not try
4606 to actually save any registers when none are active. */
4607 #define NO_HIGHEST_ACTIVE_REG (1 << BYTEWIDTH)
4608 #define NO_LOWEST_ACTIVE_REG (NO_HIGHEST_ACTIVE_REG + 1)
4610 /* Matching routines. */
4612 #ifndef emacs /* Emacs never uses this. */
4613 /* re_match is like re_match_2 except it takes only a single string. */
4616 re_match(struct re_pattern_buffer *bufp, const char *string, int size,
4617 int pos, struct re_registers *regs)
4620 re_match_2_internal(bufp, NULL, 0, (re_char *) string, size,
4625 #endif /* not emacs */
4627 /* re_match_2 matches the compiled pattern in BUFP against the
4628 (virtual) concatenation of STRING1 and STRING2 (of length SIZE1 and
4629 SIZE2, respectively). We start matching at POS, and stop matching
4632 If REGS is non-null and the `no_sub' field of BUFP is nonzero, we
4633 store offsets for the substring each group matched in REGS. See the
4634 documentation for exactly how many groups we fill.
4636 We return -1 if no match, -2 if an internal error (such as the
4637 failure stack overflowing). Otherwise, we return the length of the
4638 matched substring. */
4641 re_match_2(struct re_pattern_buffer *bufp, const char *string1,
4642 int size1, const char *string2, int size2, int pos,
4643 struct re_registers *regs, int stop)
4648 SETUP_SYNTAX_CACHE_FOR_OBJECT(regex_match_object,
4650 SYNTAX_CACHE_OBJECT_BYTE_TO_CHAR
4651 (regex_match_object, regex_emacs_buffer,
4655 result = re_match_2_internal(bufp, (re_char *) string1, size1,
4656 (re_char *) string2, size2,
4663 /* This is a separate function so that we can force an alloca cleanup
4666 re_match_2_internal(struct re_pattern_buffer *bufp, re_char * string1,
4667 int size1, re_char * string2, int size2, int pos,
4668 struct re_registers *regs, int stop)
4670 /* General temporaries. */
4673 int should_succeed; /* XEmacs change */
4675 /* Just past the end of the corresponding string. */
4676 re_char *end1, *end2;
4678 /* Pointers into string1 and string2, just past the last characters in
4679 each to consider matching. */
4680 re_char *end_match_1, *end_match_2;
4682 /* Where we are in the data, and the end of the current string. */
4683 const re_char *d, *dend;
4685 /* Where we are in the pattern, and the end of the pattern. */
4686 unsigned char *p = bufp->buffer;
4687 REGISTER unsigned char *pend = p + bufp->used;
4689 /* Mark the opcode just after a start_memory, so we can test for an
4690 empty subpattern when we get to the stop_memory. */
4691 re_char *just_past_start_mem = 0;
4693 /* We use this to map every character in the string. */
4694 RE_TRANSLATE_TYPE translate = bufp->translate;
4696 /* Failure point stack. Each place that can handle a failure further
4697 down the line pushes a failure point on this stack. It consists of
4698 restart, regend, and reg_info for all registers corresponding to
4699 the subexpressions we're currently inside, plus the number of such
4700 registers, and, finally, two char *'s. The first char * is where
4701 to resume scanning the pattern; the second one is where to resume
4702 scanning the strings. If the latter is zero, the failure point is
4703 a ``dummy''; if a failure happens and the failure point is a dummy,
4704 it gets discarded and the next one is tried. */
4705 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4706 fail_stack_type fail_stack;
4708 #ifdef REGEX_DEBUG_FLAG
4709 static unsigned failure_id;
4710 int nfailure_points_pushed = 0, nfailure_points_popped = 0;
4714 /* This holds the pointer to the failure stack, when
4715 it is allocated relocatably. */
4716 fail_stack_elt_t *failure_stack_ptr;
4719 /* We fill all the registers internally, independent of what we
4720 return, for use in backreferences. The number here includes
4721 an element for register zero. */
4722 int num_regs = bufp->re_ngroups + 1;
4724 /* The currently active registers. */
4725 int lowest_active_reg = NO_LOWEST_ACTIVE_REG;
4726 int highest_active_reg = NO_HIGHEST_ACTIVE_REG;
4728 /* Information on the contents of registers. These are pointers into
4729 the input strings; they record just what was matched (on this
4730 attempt) by a subexpression part of the pattern, that is, the
4731 regnum-th regstart pointer points to where in the pattern we began
4732 matching and the regnum-th regend points to right after where we
4733 stopped matching the regnum-th subexpression. (The zeroth register
4734 keeps track of what the whole pattern matches.) */
4735 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4736 re_char **regstart, **regend;
4739 /* If a group that's operated upon by a repetition operator fails to
4740 match anything, then the register for its start will need to be
4741 restored because it will have been set to wherever in the string we
4742 are when we last see its open-group operator. Similarly for a
4744 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4745 re_char **old_regstart, **old_regend;
4748 /* The is_active field of reg_info helps us keep track of which (possibly
4749 nested) subexpressions we are currently in. The matched_something
4750 field of reg_info[reg_num] helps us tell whether or not we have
4751 matched any of the pattern so far this time through the reg_num-th
4752 subexpression. These two fields get reset each time through any
4753 loop their register is in. */
4754 #ifdef MATCH_MAY_ALLOCATE /* otherwise, this is global. */
4755 register_info_type *reg_info;
4758 /* The following record the register info as found in the above
4759 variables when we find a match better than any we've seen before.
4760 This happens as we backtrack through the failure points, which in
4761 turn happens only if we have not yet matched the entire string. */
4762 unsigned best_regs_set = false;
4763 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4764 re_char **best_regstart, **best_regend;
4767 /* Logically, this is `best_regend[0]'. But we don't want to have to
4768 allocate space for that if we're not allocating space for anything
4769 else (see below). Also, we never need info about register 0 for
4770 any of the other register vectors, and it seems rather a kludge to
4771 treat `best_regend' differently than the rest. So we keep track of
4772 the end of the best match so far in a separate variable. We
4773 initialize this to NULL so that when we backtrack the first time
4774 and need to test it, it's not garbage. */
4775 re_char *match_end = NULL;
4777 /* This helps SET_REGS_MATCHED avoid doing redundant work. */
4778 int set_regs_matched_done = 0;
4780 /* Used when we pop values we don't care about. */
4781 #ifdef MATCH_MAY_ALLOCATE /* otherwise, these are global. */
4782 re_char **reg_dummy;
4783 register_info_type *reg_info_dummy;
4786 #ifdef REGEX_DEBUG_FLAG
4787 /* Counts the total number of registers pushed. */
4788 unsigned num_regs_pushed = 0;
4791 /* 1 if this match ends in the same string (string1 or string2)
4792 as the best previous match. */
4795 /* 1 if this match is the best seen so far. */
4796 re_bool best_match_p;
4798 DEBUG_PRINT1("\n\nEntering re_match_2.\n");
4802 #ifdef MATCH_MAY_ALLOCATE
4803 /* Do not bother to initialize all the register variables if there are
4804 no groups in the pattern, as it takes a fair amount of time. If
4805 there are groups, we include space for register 0 (the whole
4806 pattern), even though we never use it, since it simplifies the
4807 array indexing. We should fix this. */
4808 if (bufp->re_ngroups) {
4809 regstart = REGEX_TALLOC(num_regs, re_char *);
4810 regend = REGEX_TALLOC(num_regs, re_char *);
4811 old_regstart = REGEX_TALLOC(num_regs, re_char *);
4812 old_regend = REGEX_TALLOC(num_regs, re_char *);
4813 best_regstart = REGEX_TALLOC(num_regs, re_char *);
4814 best_regend = REGEX_TALLOC(num_regs, re_char *);
4815 reg_info = REGEX_TALLOC(num_regs, register_info_type);
4816 reg_dummy = REGEX_TALLOC(num_regs, re_char *);
4817 reg_info_dummy = REGEX_TALLOC(num_regs, register_info_type);
4820 (regstart && regend && old_regstart && old_regend
4821 && reg_info && best_regstart && best_regend && reg_dummy
4822 && reg_info_dummy)) {
4827 /* We must initialize all our variables to NULL, so that
4828 `FREE_VARIABLES' doesn't try to free them. */
4829 regstart = regend = old_regstart = old_regend = best_regstart
4830 = best_regend = reg_dummy = NULL;
4831 reg_info = reg_info_dummy = (register_info_type *) NULL;
4833 #endif /* MATCH_MAY_ALLOCATE */
4835 /* The starting position is bogus. */
4836 if (pos < 0 || pos > size1 + size2) {
4841 /* Initialize subexpression text positions to -1 to mark ones that no
4842 start_memory/stop_memory has been seen for. Also initialize the
4843 register information struct. */
4844 for (mcnt = 1; mcnt < num_regs; mcnt++) {
4845 regstart[mcnt] = regend[mcnt]
4846 = old_regstart[mcnt] = old_regend[mcnt] = REG_UNSET_VALUE;
4848 REG_MATCH_NULL_STRING_P(reg_info[mcnt]) =
4849 MATCH_NULL_UNSET_VALUE;
4850 IS_ACTIVE(reg_info[mcnt]) = 0;
4851 MATCHED_SOMETHING(reg_info[mcnt]) = 0;
4852 EVER_MATCHED_SOMETHING(reg_info[mcnt]) = 0;
4854 /* We move `string1' into `string2' if the latter's empty -- but not if
4855 `string1' is null. */
4856 if (size2 == 0 && string1 != NULL) {
4862 end1 = string1 + size1;
4863 end2 = string2 + size2;
4865 /* Compute where to stop matching, within the two strings. */
4866 if (stop <= size1) {
4867 end_match_1 = string1 + stop;
4868 end_match_2 = string2;
4871 end_match_2 = string2 + stop - size1;
4874 /* `p' scans through the pattern as `d' scans through the data.
4875 `dend' is the end of the input string that `d' points within. `d'
4876 is advanced into the following input string whenever necessary, but
4877 this happens before fetching; therefore, at the beginning of the
4878 loop, `d' can be pointing at the end of a string, but it cannot
4880 if (size1 > 0 && pos <= size1) {
4884 d = string2 + pos - size1;
4888 DEBUG_PRINT1("The compiled pattern is: \n");
4889 DEBUG_PRINT_COMPILED_PATTERN(bufp, p, pend);
4890 DEBUG_PRINT1("The string to match is: `");
4891 DEBUG_PRINT_DOUBLE_STRING(d, string1, size1, string2, size2);
4892 DEBUG_PRINT1("'\n");
4894 /* This loops over pattern commands. It exits by returning from the
4895 function if the match is complete, or it drops through if the match
4896 fails at this starting point in the input data. */
4898 DEBUG_PRINT2("\n0x%lx: ", (long)p);
4899 #ifdef emacs /* XEmacs added, w/removal of immediate_quit */
4900 if (!no_quit_in_re_search)
4905 /* End of pattern means we might have succeeded. */
4906 DEBUG_PRINT1("end of pattern ... ");
4908 /* If we haven't matched the entire string, and we want
4909 the longest match, try backtracking. */
4910 if (d != end_match_2) {
4911 same_str_p = (FIRST_STRING_P(match_end)
4912 == MATCHING_IN_FIRST_STRING);
4914 /* AIX compiler got confused when this was
4915 combined with the previous declaration. */
4917 best_match_p = d > match_end;
4920 !MATCHING_IN_FIRST_STRING;
4922 DEBUG_PRINT1("backtracking.\n");
4924 if (!FAIL_STACK_EMPTY()) {
4925 /* More failure points to try. */
4927 /* If exceeds best match so far, save
4929 if (!best_regs_set || best_match_p) {
4930 best_regs_set = true;
4934 ("\nSAVING match as best so far.\n");
4936 for (mcnt = 1; mcnt < num_regs;
4938 best_regstart[mcnt] =
4947 /* If no failure points, don't restore garbage.
4948 And if last match is real best match, don't
4949 restore second best one. */
4950 else if (best_regs_set && !best_match_p) {
4952 /* Restore best match. It may happen
4953 that `dend == end_match_1' while the
4954 restored d is in string2. For
4955 example, the pattern `x.*y.*z'
4956 against the strings `x-' and `y-z-',
4957 if the two strings are not
4958 consecutive in memory. */
4960 ("Restoring best registers.\n");
4963 dend = ((d >= string1 && d <= end1)
4964 ? end_match_1 : end_match_2);
4966 for (mcnt = 1; mcnt < num_regs; mcnt++) {
4968 best_regstart[mcnt];
4974 /* d != end_match_2 */
4976 DEBUG_PRINT1("Accepting match.\n");
4978 int num_nonshy_regs = bufp->re_nsub + 1;
4979 /* If caller wants register contents data back,
4981 if (regs && !bufp->no_sub) {
4982 /* Have the register data arrays been
4984 if (bufp->regs_allocated == REGS_UNALLOCATED) {
4985 /* No. So allocate them with malloc.
4986 We need one extra element beyond
4987 `num_regs' for the `-1' marker GNU
4989 regs->num_regs = MAX(RE_NREGS, num_nonshy_regs + 1);
4990 regs->start = TALLOC(regs->num_regs, regoff_t);
4991 regs->end = TALLOC(regs->num_regs, regoff_t);
4992 if (regs->start == NULL || regs->end == NULL) {
4996 bufp->regs_allocated = REGS_REALLOCATE;
4997 } else if (bufp->regs_allocated == REGS_REALLOCATE) {
4998 /* Yes. If we need more elements than were already
4999 allocated, reallocate them. If we need fewer, just
5001 if (regs->num_regs < num_nonshy_regs + 1) {
5002 regs->num_regs = num_nonshy_regs + 1;
5003 RETALLOC(regs->start, regs->num_regs, regoff_t);
5004 RETALLOC(regs->end, regs->num_regs, regoff_t);
5005 if (regs->start == NULL || regs->end == NULL) {
5011 /* The braces fend off a "empty body in an else-statement"
5012 warning under GCC when assert expands to nothing. */
5013 assert (bufp->regs_allocated == REGS_FIXED);
5016 /* Convert the pointer data in `regstart' and `regend' to
5017 indices. Register zero has to be set differently,
5018 since we haven't kept track of any info for it. */
5019 if (regs->num_regs > 0) {
5020 regs->start[0] = pos;
5021 regs->end[0] = (MATCHING_IN_FIRST_STRING
5022 ? ((regoff_t) (d - string1))
5023 : ((regoff_t) (d - string2 + size1)));
5026 /* Map over the NUM_NONSHY_REGS non-shy internal registers.
5027 Copy each into the corresponding external register.
5028 N.B. MCNT indexes external registers. */
5030 mcnt < MIN (num_nonshy_regs, regs->num_regs);
5032 int ireg = bufp->external_to_internal_register[mcnt];
5034 if (REG_UNSET (regstart[ireg]) || REG_UNSET (regend[ireg]))
5035 regs->start[mcnt] = regs->end[mcnt] = -1;
5038 = (regoff_t) POINTER_TO_OFFSET (regstart[ireg]);
5040 = (regoff_t) POINTER_TO_OFFSET (regend[ireg]);
5043 } /* regs && !bufp->no_sub */
5045 /* If we have regs and the regs structure has
5046 more elements than were in the pattern, set
5047 the extra elements to -1. If we
5048 (re)allocated the registers, this is the
5049 case, because we always allocate enough to
5050 have at least one -1 at the end.
5052 We do this even when no_sub is set because
5053 some applications (XEmacs) reuse register
5054 structures which may contain stale
5055 information, and permit attempts to access
5058 It would be possible to require the caller to
5059 do this, but we'd have to change the API for
5060 this function to reflect that, and audit all
5062 if (regs && regs->num_regs > 0)
5063 for (mcnt = num_nonshy_regs; mcnt < regs->num_regs; mcnt++)
5064 regs->start[mcnt] = regs->end[mcnt] = -1;
5067 DEBUG_PRINT4("%u failure points pushed, %u popped (%u remain).\n",
5068 nfailure_points_pushed, nfailure_points_popped,
5069 nfailure_points_pushed - nfailure_points_popped);
5070 DEBUG_PRINT2("%u registers pushed.\n", num_regs_pushed);
5072 mcnt = d - pos - (MATCHING_IN_FIRST_STRING
5076 DEBUG_PRINT2("Returning %d from re_match_2.\n", mcnt);
5082 /* Otherwise match next pattern command. */
5083 switch (SWITCH_ENUM_CAST((re_opcode_t) *p++)) {
5084 /* Ignore these. Used to ignore the n of succeed_n's
5085 which currently have n == 0. */
5087 DEBUG_PRINT1("EXECUTING no_op.\n");
5091 DEBUG_PRINT1("EXECUTING succeed.\n");
5094 /* Match the next n pattern characters exactly. The
5095 following byte in the pattern defines n, and the n
5096 bytes after that are the characters to match. */
5099 DEBUG_PRINT2("EXECUTING exactn %d.\n", mcnt);
5101 /* This is written out as an if-else so we don't waste
5102 time testing `translate' inside the loop. */
5103 if (TRANSLATE_P(translate)) {
5106 Emchar pat_ch, buf_ch;
5110 pat_ch = charptr_emchar(p);
5111 buf_ch = charptr_emchar(d);
5112 if (RE_TRANSLATE(buf_ch) != pat_ch)
5115 pat_len = charcount_to_bytecount(p, 1);
5120 #else /* not MULE */
5122 if ((unsigned char)RE_TRANSLATE(*d++) !=
5140 /* Match any character except possibly a newline or a
5143 DEBUG_PRINT1("EXECUTING anychar.\n");
5147 if ((!(bufp->syntax & RE_DOT_NEWLINE)
5148 && TRANSLATE(*d) == '\n')
5149 || (bufp->syntax & RE_DOT_NOT_NULL
5150 && TRANSLATE(*d) == '\000'))
5154 DEBUG_PRINT2(" Matched `%d'.\n", *d);
5155 INC_CHARPTR(d); /* XEmacs change */
5160 REGISTER unsigned char c;
5162 (re_opcode_t) * (p - 1) == charset_not;
5164 DEBUG_PRINT2("EXECUTING charset%s.\n",
5165 not_p ? "_not" : "");
5168 /* The character to match. */
5171 /* Cast to `unsigned' instead of `unsigned char'
5172 in case the bit list is a full 32 bytes
5174 if (c < (unsigned)(*p * BYTEWIDTH)
5176 c / BYTEWIDTH] & (1 << (c %
5186 INC_CHARPTR(d); /* XEmacs change */
5192 case charset_mule_not: {
5195 (re_opcode_t) * (p - 1) == charset_mule_not;
5197 DEBUG_PRINT2("EXECUTING charset_mule%s.\n",
5198 not_p ? "_not" : "");
5201 c = charptr_emchar((const Bufbyte *)d);
5202 /* The character to match. */
5207 unified_range_table_lookup(p, c, Qnil)))
5210 p += unified_range_table_bytes_used(p);
5221 /* The beginning of a group is represented by
5222 start_memory. The arguments are the register number
5223 in the next byte, and the number of groups inner to
5224 this one in the next. The text matched within the
5225 group is recorded (in the internal registers data
5226 structure) under the register number. */
5228 DEBUG_PRINT3("EXECUTING start_memory %d (%d):\n", *p,
5231 /* Find out if this group can match the empty string. */
5232 p1 = p; /* To send to group_match_null_string_p. */
5234 if (REG_MATCH_NULL_STRING_P(reg_info[*p]) ==
5235 MATCH_NULL_UNSET_VALUE)
5236 REG_MATCH_NULL_STRING_P(reg_info[*p])
5237 = group_match_null_string_p(&p1, pend,
5240 /* Save the position in the string where we were the
5241 last time we were at this open-group operator in case
5242 the group is operated upon by a repetition operator,
5243 e.g., with `(a*)*b' against `ab'; then we want to
5244 ignore where we are now in the string in case this
5245 attempt to match fails. */
5246 old_regstart[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
5247 ? REG_UNSET(regstart[*p]) ? d : regstart[*p]
5249 DEBUG_PRINT2(" old_regstart: %d\n",
5250 POINTER_TO_OFFSET(old_regstart[*p]));
5253 DEBUG_PRINT2(" regstart: %d\n",
5254 POINTER_TO_OFFSET(regstart[*p]));
5256 IS_ACTIVE(reg_info[*p]) = 1;
5257 MATCHED_SOMETHING(reg_info[*p]) = 0;
5259 /* Clear this whenever we change the register activity
5261 set_regs_matched_done = 0;
5263 /* This is the new highest active register. */
5264 highest_active_reg = *p;
5266 /* If nothing was active before, this is the new lowest
5268 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5269 lowest_active_reg = *p;
5271 /* Move past the register number and inner group
5274 just_past_start_mem = p;
5278 /* The stop_memory opcode represents the end of a group.
5279 Its arguments are the same as start_memory's: the
5280 register number, and the number of inner groups. */
5282 DEBUG_PRINT3("EXECUTING stop_memory %d (%d):\n", *p,
5285 /* We need to save the string position the last time we
5286 were at this close-group operator in case the group
5287 is operated upon by a repetition operator, e.g., with
5288 `((a*)*(b*)*)*' against `aba'; then we want to ignore
5289 where we are now in the string in case this attempt
5291 old_regend[*p] = REG_MATCH_NULL_STRING_P(reg_info[*p])
5292 ? REG_UNSET(regend[*p]) ? d : regend[*p]
5294 DEBUG_PRINT2(" old_regend: %d\n",
5295 POINTER_TO_OFFSET(old_regend[*p]));
5298 DEBUG_PRINT2(" regend: %d\n",
5299 POINTER_TO_OFFSET(regend[*p]));
5301 /* This register isn't active anymore. */
5302 IS_ACTIVE(reg_info[*p]) = 0;
5304 /* Clear this whenever we change the register activity
5306 set_regs_matched_done = 0;
5308 /* If this was the only register active, nothing is
5310 if (lowest_active_reg == highest_active_reg) {
5311 lowest_active_reg = NO_LOWEST_ACTIVE_REG;
5312 highest_active_reg = NO_HIGHEST_ACTIVE_REG;
5313 } else { /* We must scan for the new highest
5314 active register, since it isn't
5315 necessarily one less than now:
5316 consider (a(b)c(d(e)f)g). When group
5317 3 ends, after the f), the new highest
5318 active register is 1. */
5319 unsigned char r = *p - 1;
5320 while (r > 0 && !IS_ACTIVE(reg_info[r]))
5323 /* If we end up at register zero, that means
5324 that we saved the registers as the result of
5325 an `on_failure_jump', not a `start_memory',
5326 and we jumped to past the innermost
5327 `stop_memory'. For example, in ((.)*) we
5328 save registers 1 and 2 as a result of the *,
5329 but when we pop back to the second ), we are
5330 at the stop_memory 1. Thus, nothing is
5334 NO_LOWEST_ACTIVE_REG;
5335 highest_active_reg =
5336 NO_HIGHEST_ACTIVE_REG;
5338 highest_active_reg = r;
5340 /* 98/9/21 jhod: We've also gotta set
5341 lowest_active_reg, don't we? */
5343 while (r < highest_active_reg
5344 && !IS_ACTIVE(reg_info[r]))
5346 lowest_active_reg = r;
5350 /* If just failed to match something this time around
5351 with a group that's operated on by a repetition
5352 operator, try to force exit from the ``loop'', and
5353 restore the register information for this group that
5354 we had before trying this last match. */
5355 if ((!MATCHED_SOMETHING(reg_info[*p])
5356 || just_past_start_mem == p - 1)
5357 && (p + 2) < pend) {
5358 re_bool is_a_jump_n = false;
5362 switch ((unsigned int)(re_opcode_t)*p1++) {
5365 case pop_failure_jump:
5366 case maybe_pop_jump:
5368 case dummy_failure_jump:
5369 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
5379 /* If the next operation is a jump backwards in
5380 the pattern to an on_failure_jump right
5381 before the start_memory corresponding to this
5382 stop_memory, exit from the loop by forcing a
5383 failure after pushing on the stack the
5384 on_failure_jump's jump in the pattern, and
5387 && (re_opcode_t) * p1 == on_failure_jump
5388 && (re_opcode_t) p1[3] == start_memory
5390 /* If this group ever matched anything,
5391 then restore what its registers were
5392 before trying this last failed match,
5393 e.g., with `(a*)*b' against `ab' for
5394 regstart[1], and, e.g., with
5395 `((a*)*(b*)*)*' against `aba' for
5398 Also restore the registers for inner
5399 groups for, e.g., `((a*)(b*))*'
5400 against `aba' (register 3 would
5401 otherwise get trashed). */
5403 if (EVER_MATCHED_SOMETHING
5407 EVER_MATCHED_SOMETHING(reg_info
5411 /* Restore this and inner
5414 for (r = *p; r < *p + *(p + 1);
5419 /* xx why this test? */
5420 if (old_regend[r] >=
5428 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
5429 PUSH_FAILURE_POINT(p1 + mcnt, d, -2);
5435 /* Move past the register number and the inner group
5440 /* \<digit> has been turned into a `duplicate' command
5441 which is followed by the numeric value of <digit> as
5442 the register number. (Already passed through
5443 external-to-internal-register mapping, so it refers
5444 to the actual group number, not the non-shy-only
5445 numbering used in the external world.) */
5448 REGISTER re_char *d2, *dend2;
5449 /* Get which register to match against. */
5451 DEBUG_PRINT2("EXECUTING duplicate %d.\n",
5454 /* Can't back reference a group which we've
5456 if (REG_UNSET(regstart[regno])
5457 || REG_UNSET(regend[regno]))
5460 /* Where in input to try to start matching. */
5461 d2 = regstart[regno];
5463 /* Where to stop matching; if both the place to
5464 start and the place to stop matching are in
5465 the same string, then set to the place to
5466 stop, otherwise, for now have to use the end
5467 of the first string. */
5469 dend2 = ((FIRST_STRING_P(regstart[regno])
5470 == FIRST_STRING_P(regend[regno]))
5471 ? regend[regno] : end_match_1);
5473 /* If necessary, advance to next segment
5474 in register contents. */
5475 while (d2 == dend2) {
5476 if (dend2 == end_match_2)
5478 if (dend2 == regend[regno])
5481 /* End of string1 => advance to
5484 dend2 = regend[regno];
5486 /* At end of register contents =>
5491 /* If necessary, advance to next segment
5495 /* How many characters left in this segment to match. */
5498 /* Want how many consecutive characters
5499 we can match in one shot, so, if
5500 necessary, adjust the count. */
5501 if (mcnt > dend2 - d2)
5504 /* Compare that many; failure if
5505 mismatch, else move past them. */
5506 if (TRANSLATE_P(translate)
5508 (const unsigned char*)d,
5509 (const unsigned char*)d2,
5511 : memcmp(d, d2, mcnt)) {
5514 d += mcnt, d2 += mcnt;
5516 /* Do this because we've match some
5523 /* begline matches the empty string at the beginning of
5524 the string (unless `not_bol' is set in `bufp'), and,
5525 if `newline_anchor' is set, after newlines. */
5527 DEBUG_PRINT1("EXECUTING begline.\n");
5529 if (AT_STRINGS_BEG(d)) {
5532 } else if (d[-1] == '\n' && bufp->newline_anchor) {
5535 /* In all other cases, we fail. */
5538 /* endline is the dual of begline. */
5540 DEBUG_PRINT1("EXECUTING endline.\n");
5542 if (AT_STRINGS_END(d)) {
5547 /* We have to ``prefetch'' the next character. */
5548 else if ((d == end1 ? *string2 : *d) == '\n'
5549 && bufp->newline_anchor) {
5554 /* Match at the very beginning of the data. */
5556 DEBUG_PRINT1("EXECUTING begbuf.\n");
5557 if (AT_STRINGS_BEG(d))
5561 /* Match at the very end of the data. */
5563 DEBUG_PRINT1("EXECUTING endbuf.\n");
5564 if (AT_STRINGS_END(d))
5568 /* on_failure_keep_string_jump is used to optimize
5569 `.*\n'. It pushes NULL as the value for the string
5570 on the stack. Then `pop_failure_point' will keep the
5571 current value for the string, instead of restoring
5572 it. To see why, consider matching `foo\nbar' against
5573 `.*\n'. The .* matches the foo; then the . fails
5574 against the \n. But the next thing we want to do is
5575 match the \n against the \n; if we restored the
5576 string value, we would be back at the foo.
5578 Because this is used only in specific cases, we don't
5579 need to check all the things that `on_failure_jump'
5580 does, to make sure the right things get saved on the
5581 stack. Hence we don't share its code. The only
5582 reason to push anything on the stack at all is that
5583 otherwise we would have to change `anychar's code to
5584 do something besides goto fail in this case; that
5585 seems worse than this. */
5586 case on_failure_keep_string_jump:
5587 DEBUG_PRINT1("EXECUTING on_failure_keep_string_jump");
5589 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5590 DEBUG_PRINT3(" %d (to 0x%lx):\n", mcnt,
5593 PUSH_FAILURE_POINT(p + mcnt, (unsigned char *)0, -2);
5596 /* Uses of on_failure_jump:
5598 Each alternative starts with an on_failure_jump that
5599 points to the beginning of the next alternative.
5600 Each alternative except the last ends with a jump
5601 that in effect jumps past the rest of the
5602 alternatives. (They really jump to the ending jump
5603 of the following alternative, because tensioning
5604 these jumps is a hassle.)
5606 Repeats start with an on_failure_jump that points
5607 past both the repetition text and either the
5608 following jump or pop_failure_jump back to this
5610 case on_failure_jump:
5612 DEBUG_PRINT1("EXECUTING on_failure_jump");
5614 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5615 DEBUG_PRINT3(" %d (to 0x%lx)", mcnt, (long)(p + mcnt));
5617 /* If this on_failure_jump comes right before a group
5618 (i.e., the original * applied to a group), save the
5619 information for that group and all inner ones, so
5620 that if we fail back to this point, the group's
5621 information will be correct. For example, in
5622 \(a*\)*\1, we need the preceding group, and in
5623 \(\(a*\)b*\)\2, we need the inner group. */
5625 /* We can't use `p' to check ahead because we push
5626 a failure point to `p + mcnt' after we do this. */
5629 /* We need to skip no_op's before we look for the
5630 start_memory in case this on_failure_jump is
5631 happening as the result of a completed succeed_n, as
5632 in \(a\)\{1,3\}b\1 against aba. */
5633 while (p1 < pend && (re_opcode_t) * p1 == no_op)
5636 if (p1 < pend && (re_opcode_t) * p1 == start_memory) {
5637 /* We have a new highest active register now.
5638 This will get reset at the start_memory we
5639 are about to get to, but we will have saved
5640 all the registers relevant to this repetition
5641 op, as described above. */
5642 highest_active_reg = *(p1 + 1) + *(p1 + 2);
5643 if (lowest_active_reg == NO_LOWEST_ACTIVE_REG)
5644 lowest_active_reg = *(p1 + 1);
5647 DEBUG_PRINT1(":\n");
5648 PUSH_FAILURE_POINT(p + mcnt, d, -2);
5651 /* A smart repeat ends with `maybe_pop_jump'.
5652 We change it to either `pop_failure_jump' or `jump'. */
5653 case maybe_pop_jump:
5654 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5655 DEBUG_PRINT2("EXECUTING maybe_pop_jump %d.\n", mcnt);
5657 REGISTER const unsigned char *p2 = p;
5659 /* Compare the beginning of the repeat with what
5660 in the pattern follows its end. If we can
5661 establish that there is nothing that they
5662 would both match, i.e., that we would have to
5663 backtrack because of (as in, e.g., `a*a')
5664 then we can change to pop_failure_jump,
5665 because we'll never have to backtrack.
5667 This is not true in the case of alternatives:
5668 in `(a|ab)*' we do need to backtrack to the
5669 `ab' alternative (e.g., if the string was
5670 `ab'). But instead of trying to detect that
5671 here, the alternative has put on a dummy
5672 failure point which is what we will end up
5675 /* Skip over open/close-group commands. If what
5676 follows this loop is a ...+ construct, look
5677 at what begins its body, since we will have
5678 to match at least one of that. */
5681 && ((re_opcode_t) * p2 ==
5683 || (re_opcode_t) * p2 ==
5686 else if (p2 + 6 < pend
5687 && (re_opcode_t) * p2 ==
5695 /* p1[0] ... p1[2] are the `on_failure_jump' corresponding
5696 to the `maybe_finalize_jump' of this case. Examine what
5699 /* If we're at the end of the pattern, we can change. */
5701 /* Consider what happens when matching ":\(.*\)"
5702 against ":/". I don't really understand this code
5704 p[-3] = (unsigned char)pop_failure_jump;
5706 (" End of pattern: change to `pop_failure_jump'.\n");
5709 else if ((re_opcode_t) * p2 == exactn
5710 || (bufp->newline_anchor
5711 && (re_opcode_t) * p2 ==
5713 REGISTER unsigned char c =
5715 (unsigned char)endline ? '\n' :
5718 if ((re_opcode_t) p1[3] == exactn
5724 (" %c != %c => pop_failure_jump.\n",
5728 else if ((re_opcode_t) p1[3] == charset
5729 || (re_opcode_t) p1[3] ==
5732 (re_opcode_t) p1[3] ==
5736 (unsigned char)(p1[4] *
5740 BYTEWIDTH] & (1 << (c
5745 /* `not_p' is equal to 1 if c would match, which means
5746 that we can't change to pop_failure_jump. */
5752 (" No match => pop_failure_jump.\n");
5755 } else if ((re_opcode_t) * p2 == charset) {
5756 #ifdef REGEX_DEBUG_FLAG
5757 REGISTER unsigned char c
5760 (unsigned char)endline ? '\n' :
5764 if ((re_opcode_t) p1[3] == exactn
5765 && !((int)p2[1] * BYTEWIDTH >
5767 && (p2[2 + p1[5] / BYTEWIDTH]
5775 (" %c != %c => pop_failure_jump.\n",
5779 else if ((re_opcode_t) p1[3] ==
5782 /* We win if the charset_not inside the loop
5783 lists every character listed in the charset after. */
5784 for (idx = 0; idx < (int)p2[1];
5802 (" No match => pop_failure_jump.\n");
5804 } else if ((re_opcode_t) p1[3] ==
5807 /* We win if the charset inside the loop
5808 has no overlap with the one after the loop. */
5811 && idx < (int)p1[4]; idx++)
5822 (" No match => pop_failure_jump.\n");
5827 p -= 2; /* Point at relative address again. */
5828 if ((re_opcode_t) p[-1] != pop_failure_jump) {
5829 p[-1] = (unsigned char)jump;
5830 DEBUG_PRINT1(" Match => jump.\n");
5831 goto unconditional_jump;
5833 /* Note fall through. */
5835 /* The end of a simple repeat has a pop_failure_jump
5836 back to its matching on_failure_jump, where the
5837 latter will push a failure point. The
5838 pop_failure_jump takes off failure points put on by
5839 this pop_failure_jump's matching on_failure_jump; we
5840 got through the pattern to here from the matching
5841 on_failure_jump, so didn't fail. */
5842 case pop_failure_jump: {
5843 /* We need to pass separate storage for the
5844 lowest and highest registers, even though we
5845 don't care about the actual values.
5846 Otherwise, we will restore only one register
5847 from the stack, since lowest will == highest
5848 in `pop_failure_point'. */
5849 int dummy_low_reg, dummy_high_reg;
5850 const unsigned char *pdummy = NULL;
5851 const unsigned char *sdummy = NULL;
5853 DEBUG_PRINT1("EXECUTING pop_failure_jump.\n");
5854 POP_FAILURE_POINT(sdummy, pdummy,
5855 dummy_low_reg, dummy_high_reg,
5856 reg_dummy, reg_dummy,
5859 /* Note fall through. */
5861 /* Unconditionally jump (without popping any failure
5865 /* Get the amount to jump. */
5866 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5867 DEBUG_PRINT2("EXECUTING jump %d ", mcnt);
5868 p += mcnt; /* Do the jump. */
5869 DEBUG_PRINT2("(to 0x%lx).\n", (long)p);
5872 /* We need this opcode so we can detect where alternatives end
5873 in `group_match_null_string_p' et al. */
5875 DEBUG_PRINT1("EXECUTING jump_past_alt.\n");
5876 goto unconditional_jump;
5878 /* Normally, the on_failure_jump pushes a failure point, which
5879 then gets popped at pop_failure_jump. We will end up at
5880 pop_failure_jump, also, and with a pattern of, say, `a+', we
5881 are skipping over the on_failure_jump, so we have to push
5882 something meaningless for pop_failure_jump to pop. */
5883 case dummy_failure_jump:
5884 DEBUG_PRINT1("EXECUTING dummy_failure_jump.\n");
5885 /* It doesn't matter what we push for the string here. What
5886 the code at `fail' tests is the value for the pattern. */
5887 PUSH_FAILURE_POINT(NULL, NULL, -2);
5888 goto unconditional_jump;
5890 /* At the end of an alternative, we need to push a dummy failure
5891 point in case we are followed by a `pop_failure_jump', because
5892 we don't want the failure point for the alternative to be
5893 popped. For example, matching `(a|ab)*' against `aab'
5894 requires that we match the `ab' alternative. */
5895 case push_dummy_failure:
5896 DEBUG_PRINT1("EXECUTING push_dummy_failure.\n");
5897 /* See comments just above at `dummy_failure_jump' about the
5899 PUSH_FAILURE_POINT(NULL, NULL, -2);
5902 /* Have to succeed matching what follows at least n times.
5903 After that, handle like `on_failure_jump'. */
5905 EXTRACT_NUMBER(mcnt, p + 2);
5906 DEBUG_PRINT2("EXECUTING succeed_n %d.\n", mcnt);
5909 /* Originally, this is how many times we HAVE to succeed. */
5913 STORE_NUMBER_AND_INCR(p, mcnt);
5914 DEBUG_PRINT3(" Setting 0x%lx to %d.\n",
5916 } else if (mcnt == 0) {
5918 (" Setting two bytes from 0x%lx to no_op.\n",
5920 p[2] = (unsigned char)no_op;
5921 p[3] = (unsigned char)no_op;
5927 EXTRACT_NUMBER(mcnt, p + 2);
5928 DEBUG_PRINT2("EXECUTING jump_n %d.\n", mcnt);
5930 /* Originally, this is how many times we CAN jump. */
5933 STORE_NUMBER(p + 2, mcnt);
5934 goto unconditional_jump;
5936 /* If don't have to jump any more, skip over the rest of command. */
5943 DEBUG_PRINT1("EXECUTING set_number_at.\n");
5945 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5947 EXTRACT_NUMBER_AND_INCR(mcnt, p);
5948 DEBUG_PRINT3(" Setting 0x%lx to %d.\n",
5950 STORE_NUMBER(p1, mcnt);
5955 DEBUG_PRINT1("EXECUTING wordbound.\n");
5960 /* Straightforward and (I hope) correct implementation.
5961 Probably should be optimized by arranging to compute
5963 /* emch1 is the character before d, syn1 is the syntax of
5964 emch1, emch2 is the character at d, and syn2 is the
5966 Emchar emch1, emch2;
5967 /* GCC isn't smart enough to see these are initialized if used. */
5968 int syn1 = 0, syn2 = 0;
5969 re_char *d_before, *d_after;
5971 at_beg = AT_STRINGS_BEG(d),
5972 at_end = AT_STRINGS_END(d);
5977 if (at_beg && at_end) {
5982 POS_BEFORE_GAP_UNSAFE(d);
5983 DEC_CHARPTR(d_before);
5985 charptr_emchar(d_before);
5988 SYNTAX_CACHE_BYTE_TO_CHAR
5989 (PTR_TO_OFFSET(d)) - 1;
5990 UPDATE_SYNTAX_CACHE(xpos);
5992 syn1 = SYNTAX_FROM_CACHE
5994 (regex_emacs_buffer->
5995 mirror_syntax_table),
6000 POS_AFTER_GAP_UNSAFE(d);
6001 emch2 = charptr_emchar(d_after);
6004 SYNTAX_CACHE_BYTE_TO_CHAR
6006 UPDATE_SYNTAX_CACHE_FORWARD(xpos
6010 syn2 = SYNTAX_FROM_CACHE
6012 (regex_emacs_buffer->
6013 mirror_syntax_table),
6018 result = (syn2 == Sword);
6020 result = (syn1 == Sword);
6027 if (result == should_succeed)
6033 DEBUG_PRINT1("EXECUTING notwordbound.\n");
6035 goto matchwordbound;
6038 DEBUG_PRINT1("EXECUTING wordbeg.\n");
6039 if (AT_STRINGS_END(d))
6042 /* XEmacs: this originally read:
6044 if (WORDCHAR_P (d) && (AT_STRINGS_BEG (d) || !WORDCHAR_P (d - 1)))
6048 re_char *dtmp = POS_AFTER_GAP_UNSAFE(d);
6049 Emchar emch = charptr_emchar(dtmp);
6052 SYNTAX_CACHE_BYTE_TO_CHAR(PTR_TO_OFFSET(d));
6053 UPDATE_SYNTAX_CACHE(charpos);
6055 if (SYNTAX_FROM_CACHE
6057 (regex_emacs_buffer->mirror_syntax_table),
6060 if (AT_STRINGS_BEG(d))
6062 dtmp = POS_BEFORE_GAP_UNSAFE(d);
6064 emch = charptr_emchar(dtmp);
6066 UPDATE_SYNTAX_CACHE_BACKWARD(charpos - 1);
6068 if (SYNTAX_FROM_CACHE
6070 (regex_emacs_buffer->mirror_syntax_table),
6077 DEBUG_PRINT1("EXECUTING wordend.\n");
6078 if (AT_STRINGS_BEG(d))
6081 /* XEmacs: this originally read:
6083 if (!AT_STRINGS_BEG (d) && WORDCHAR_P (d - 1)
6084 && (!WORDCHAR_P (d) || AT_STRINGS_END (d)))
6087 The or condition is incorrect (reversed).
6093 SYNTAX_CACHE_BYTE_TO_CHAR(PTR_TO_OFFSET(d))
6095 UPDATE_SYNTAX_CACHE(charpos);
6097 dtmp = POS_BEFORE_GAP_UNSAFE(d);
6099 emch = charptr_emchar(dtmp);
6100 if (SYNTAX_FROM_CACHE
6102 (regex_emacs_buffer->mirror_syntax_table),
6105 if (AT_STRINGS_END(d))
6107 dtmp = POS_AFTER_GAP_UNSAFE(d);
6108 emch = charptr_emchar(dtmp);
6110 UPDATE_SYNTAX_CACHE_FORWARD(charpos + 1);
6112 if (SYNTAX_FROM_CACHE
6114 (regex_emacs_buffer->mirror_syntax_table),
6122 DEBUG_PRINT1("EXECUTING before_dot.\n");
6124 (NILP(regex_match_object)
6125 || BUFFERP(regex_match_object))
6126 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6127 >= BUF_PT(regex_emacs_buffer)))
6132 DEBUG_PRINT1("EXECUTING at_dot.\n");
6134 (NILP(regex_match_object)
6135 || BUFFERP(regex_match_object))
6136 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6137 != BUF_PT(regex_emacs_buffer)))
6142 DEBUG_PRINT1("EXECUTING after_dot.\n");
6144 (NILP(regex_match_object)
6145 || BUFFERP(regex_match_object))
6146 || (BUF_PTR_BYTE_POS(regex_emacs_buffer, d)
6147 <= BUF_PT(regex_emacs_buffer)))
6150 #if 0 /* not emacs19 */
6152 DEBUG_PRINT1("EXECUTING at_dot.\n");
6153 if (BUF_PTR_BYTE_POS
6154 (regex_emacs_buffer,
6155 (unsigned char *)d) + 1 !=
6156 BUF_PT(regex_emacs_buffer))
6159 #endif /* not emacs19 */
6162 DEBUG_PRINT2("EXECUTING syntaxspec %d.\n", mcnt);
6167 DEBUG_PRINT1("EXECUTING Emacs wordchar.\n");
6180 SYNTAX_CACHE_BYTE_TO_CHAR
6182 UPDATE_SYNTAX_CACHE(charpos);
6186 emch = charptr_emchar((const Bufbyte *)d);
6190 (regex_emacs_buffer->mirror_syntax_table),
6191 emch) == (enum syntaxcode)mcnt);
6193 if (matches != should_succeed)
6200 DEBUG_PRINT2("EXECUTING notsyntaxspec %d.\n", mcnt);
6202 goto matchnotsyntax;
6205 DEBUG_PRINT1("EXECUTING Emacs notwordchar.\n");
6209 goto matchornotsyntax;
6212 /* 97/2/17 jhod Mule category code patch */
6221 emch = charptr_emchar((const Bufbyte *)d);
6223 if (check_category_char
6224 (emch, regex_emacs_buffer->category_table,
6225 mcnt, should_succeed))
6231 case notcategoryspec:
6233 goto matchornotcategory;
6234 /* end of category patch */
6236 #else /* not emacs */
6238 DEBUG_PRINT1("EXECUTING non-Emacs wordchar.\n");
6240 if (!WORDCHAR_P_UNSAFE((int)(*d)))
6247 DEBUG_PRINT1("EXECUTING non-Emacs notwordchar.\n");
6249 if (!WORDCHAR_P_UNSAFE((int)(*d)))
6259 /* Successfully executed one pattern command; keep going. */
6262 /* We goto here if a matching operation fails. */
6264 if (!FAIL_STACK_EMPTY()) {
6265 /* A restart point is known. Restore to that state. */
6266 DEBUG_PRINT1("\nFAIL:\n");
6267 POP_FAILURE_POINT(d, p, lowest_active_reg,
6268 highest_active_reg, regstart, regend,
6271 /* If this failure point is a dummy, try the next one. */
6275 /* If we failed to the end of the pattern, don't examine
6279 re_bool is_a_jump_n = false;
6281 /* If failed to a backwards jump that's part of
6282 a repetition loop, need to pop this failure
6283 point and use the next one. */
6284 switch ((unsigned int)(re_opcode_t)*p) {
6287 case maybe_pop_jump:
6288 case pop_failure_jump:
6291 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6295 && (re_opcode_t) * p1 == succeed_n)
6297 && (re_opcode_t) * p1 ==
6306 if (d >= string1 && d <= end1)
6309 break; /* Matching at this starting point really fails. */
6313 goto restore_best_regs;
6317 return -1; /* Failure to match. */
6321 /* Subroutine definitions for re_match_2. */
6323 /* We are passed P pointing to a register number after a start_memory.
6325 Return true if the pattern up to the corresponding stop_memory can
6326 match the empty string, and false otherwise.
6328 If we find the matching stop_memory, sets P to point to one past its number.
6329 Otherwise, sets P to an undefined byte less than or equal to END.
6331 We don't handle duplicates properly (yet). */
6334 group_match_null_string_p(unsigned char **p, unsigned char *end,
6335 register_info_type * register_info)
6338 /* Point to after the args to the start_memory. */
6339 unsigned char *p1 = *p + 2;
6342 /* Skip over opcodes that can match nothing, and return true or
6343 false, as appropriate, when we get to one that can't, or to the
6344 matching stop_memory. */
6346 switch ((unsigned int)(re_opcode_t)*p1) {
6347 /* Could be either a loop or a series of alternatives. */
6348 case on_failure_jump:
6350 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6352 /* If the next operation is not a jump backwards in the
6356 /* Go through the on_failure_jumps of the alternatives,
6357 seeing if any of the alternatives cannot match nothing.
6358 The last alternative starts with only a jump,
6359 whereas the rest start with on_failure_jump and end
6360 with a jump, e.g., here is the pattern for `a|b|c':
6362 /on_failure_jump/0/6/exactn/1/a/jump_past_alt/0/6
6363 /on_failure_jump/0/6/exactn/1/b/jump_past_alt/0/3
6366 So, we have to first go through the first (n-1)
6367 alternatives and then deal with the last one separately. */
6369 /* Deal with the first (n-1) alternatives, which start
6370 with an on_failure_jump (see above) that jumps to right
6371 past a jump_past_alt. */
6373 while ((re_opcode_t) p1[mcnt - 3] ==
6375 /* `mcnt' holds how many bytes long the alternative
6376 is, including the ending `jump_past_alt' and
6379 if (!alt_match_null_string_p
6380 (p1, p1 + mcnt - 3, register_info))
6383 /* Move to right after this alternative, including the
6387 /* Break if it's the beginning of an n-th alternative
6388 that doesn't begin with an on_failure_jump. */
6389 if ((re_opcode_t) * p1 !=
6393 /* Still have to check that it's not an n-th
6394 alternative that starts with an on_failure_jump. */
6396 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6397 if ((re_opcode_t) p1[mcnt - 3] !=
6399 /* Get to the beginning of the n-th alternative. */
6405 /* Deal with the last alternative: go back and get number
6406 of the `jump_past_alt' just before it. `mcnt' contains
6407 the length of the alternative. */
6408 EXTRACT_NUMBER(mcnt, p1 - 2);
6410 if (!alt_match_null_string_p
6411 (p1, p1 + mcnt, register_info))
6414 p1 += mcnt; /* Get past the n-th alternative. */
6419 assert(p1[1] == **p);
6424 if (!common_op_match_null_string_p(&p1, end, register_info))
6427 } /* while p1 < end */
6430 } /* group_match_null_string_p */
6432 /* Similar to group_match_null_string_p, but doesn't deal with alternatives:
6433 It expects P to be the first byte of a single alternative and END one
6434 byte past the last. The alternative can contain groups. */
6437 alt_match_null_string_p(unsigned char *p, unsigned char *end,
6438 register_info_type * register_info)
6441 unsigned char *p1 = p;
6444 /* Skip over opcodes that can match nothing, and break when we get
6445 to one that can't. */
6447 switch ((unsigned int)(re_opcode_t)*p1) {
6449 case on_failure_jump:
6451 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6456 if (!common_op_match_null_string_p(&p1, end, register_info))
6459 } /* while p1 < end */
6462 } /* alt_match_null_string_p */
6464 /* Deals with the ops common to group_match_null_string_p and
6465 alt_match_null_string_p.
6467 Sets P to one after the op and its arguments, if any. */
6470 common_op_match_null_string_p(unsigned char **p, unsigned char *end,
6471 register_info_type * register_info)
6476 unsigned char *p1 = *p;
6478 switch ((unsigned int)(re_opcode_t)*p1++) {
6497 assert(reg_no > 0 && reg_no <= MAX_REGNUM);
6498 ret = group_match_null_string_p(&p1, end, register_info);
6500 /* Have to set this here in case we're checking a group which
6501 contains a group and a back reference to it. */
6503 if (REG_MATCH_NULL_STRING_P(register_info[reg_no]) ==
6504 MATCH_NULL_UNSET_VALUE)
6505 REG_MATCH_NULL_STRING_P(register_info[reg_no]) = ret;
6511 /* If this is an optimized succeed_n for zero times, make the jump. */
6513 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6521 /* Get to the number of times to succeed. */
6523 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6527 EXTRACT_NUMBER_AND_INCR(mcnt, p1);
6534 if (!REG_MATCH_NULL_STRING_P(register_info[*p1]))
6542 /* All other opcodes mean we cannot match the empty string. */
6548 } /* common_op_match_null_string_p */
6550 /* Return zero if TRANSLATE[S1] and TRANSLATE[S2] are identical for LEN
6551 bytes; nonzero otherwise. */
6554 bcmp_translate(re_char *s1, re_char *s2,
6555 REGISTER int len, RE_TRANSLATE_TYPE translate)
6557 REGISTER const unsigned char *p1 = s1, *p2 = s2;
6559 const unsigned char *p1_end = s1 + len;
6560 const unsigned char *p2_end = s2 + len;
6562 while (p1 != p1_end && p2 != p2_end) {
6563 Emchar p1_ch, p2_ch;
6565 p1_ch = charptr_emchar(p1);
6566 p2_ch = charptr_emchar(p2);
6568 if (RE_TRANSLATE(p1_ch)
6569 != RE_TRANSLATE(p2_ch))
6574 #else /* not MULE */
6576 if (RE_TRANSLATE(*p1++) != RE_TRANSLATE(*p2++))
6584 /* Entry points for GNU code. */
6586 /* re_compile_pattern is the GNU regular expression compiler: it
6587 compiles PATTERN (of length SIZE) and puts the result in BUFP.
6588 Returns 0 if the pattern was valid, otherwise an error string.
6590 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
6591 are set in BUFP on entry.
6593 We call regex_compile to do the actual compilation. */
6595 const char *re_compile_pattern(const char *pattern, int length,
6596 struct re_pattern_buffer *bufp)
6600 /* GNU code is written to assume at least RE_NREGS registers will be set
6601 (and at least one extra will be -1). */
6602 bufp->regs_allocated = REGS_UNALLOCATED;
6604 /* And GNU code determines whether or not to get register information
6605 by passing null for the REGS argument to re_match, etc., not by
6609 /* Match anchors at newline. */
6610 bufp->newline_anchor = 1;
6612 ret = regex_compile((const unsigned char*)pattern,
6613 length, re_syntax_options, bufp);
6617 return gettext(re_error_msgid[(int)ret]);
6620 /* Entry points compatible with 4.2 BSD regex library. We don't define
6621 them unless specifically requested. */
6623 #ifdef _REGEX_RE_COMP
6625 /* BSD has one and only one pattern buffer. */
6626 static struct re_pattern_buffer re_comp_buf;
6628 char *re_comp(const char *s)
6633 if (!re_comp_buf.buffer)
6634 return gettext("No previous regular expression");
6638 if (!re_comp_buf.buffer) {
6639 re_comp_buf.buffer = (unsigned char *)xmalloc_atomic(200);
6640 if (re_comp_buf.buffer == NULL)
6641 return gettext(re_error_msgid[(int)REG_ESPACE]);
6642 re_comp_buf.allocated = 200;
6644 re_comp_buf.fastmap = (char *)xmalloc_atomic(1 << BYTEWIDTH);
6645 if (re_comp_buf.fastmap == NULL)
6646 return gettext(re_error_msgid[(int)REG_ESPACE]);
6649 /* Since `re_exec' always passes NULL for the `regs' argument, we
6650 don't need to initialize the pattern buffer fields which affect it. */
6652 /* Match anchors at newlines. */
6653 re_comp_buf.newline_anchor = 1;
6656 regex_compile((unsigned char *)s, strlen(s), re_syntax_options,
6662 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
6663 return (char *)gettext(re_error_msgid[(int)ret]);
6666 int re_exec(const char *s)
6668 const int len = strlen(s);
6670 0 <= re_search(&re_comp_buf, s, len, 0, len,
6671 (struct re_registers *)0);
6673 #endif /* _REGEX_RE_COMP */
6675 /* POSIX.2 functions. Don't define these for Emacs. */
6679 /* regcomp takes a regular expression as a string and compiles it.
6681 PREG is a regex_t *. We do not expect any fields to be initialized,
6682 since POSIX says we shouldn't. Thus, we set
6684 `buffer' to the compiled pattern;
6685 `used' to the length of the compiled pattern;
6686 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
6687 REG_EXTENDED bit in CFLAGS is set; otherwise, to
6688 RE_SYNTAX_POSIX_BASIC;
6689 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
6690 `fastmap' and `fastmap_accurate' to zero;
6691 `re_nsub' to the number of subexpressions in PATTERN.
6692 (non-shy of course. POSIX probably doesn't know about
6693 shy ones, and in any case they should be invisible.)
6695 PATTERN is the address of the pattern string.
6697 CFLAGS is a series of bits which affect compilation.
6699 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
6700 use POSIX basic syntax.
6702 If REG_NEWLINE is set, then . and [^...] don't match newline.
6703 Also, regexec will try a match beginning after every newline.
6705 If REG_ICASE is set, then we considers upper- and lowercase
6706 versions of letters to be equivalent when matching.
6708 If REG_NOSUB is set, then when PREG is passed to regexec, that
6709 routine will report only success or failure, and nothing about the
6712 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
6713 the return codes and their meanings.) */
6715 int regcomp(regex_t * preg, const char *pattern, int cflags)
6719 = (cflags & REG_EXTENDED) ?
6720 RE_SYNTAX_POSIX_EXTENDED : RE_SYNTAX_POSIX_BASIC;
6722 /* regex_compile will allocate the space for the compiled pattern. */
6724 preg->allocated = 0;
6727 /* Don't bother to use a fastmap when searching. This simplifies the
6728 REG_NEWLINE case: if we used a fastmap, we'd have to put all the
6729 characters after newlines into the fastmap. This way, we just try
6733 if (cflags & REG_ICASE) {
6736 preg->translate = (char *)xmalloc_atomic(CHAR_SET_SIZE);
6737 if (preg->translate == NULL)
6738 return (int)REG_ESPACE;
6740 /* Map uppercase characters to corresponding lowercase ones. */
6741 for (i = 0; i < CHAR_SET_SIZE; i++)
6742 preg->translate[i] = ISUPPER(i) ? tolower(i) : i;
6744 preg->translate = NULL;
6746 /* If REG_NEWLINE is set, newlines are treated differently. */
6747 if (cflags & REG_NEWLINE) {
6748 /* REG_NEWLINE implies neither . nor [^...] match newline. */
6749 syntax &= ~RE_DOT_NEWLINE;
6750 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
6751 /* It also changes the matching behavior. */
6752 preg->newline_anchor = 1;
6754 preg->newline_anchor = 0;
6756 preg->no_sub = !!(cflags & REG_NOSUB);
6758 /* POSIX says a null character in the pattern terminates it, so we
6759 can use strlen here in compiling the pattern. */
6760 ret = regex_compile((const unsigned char*)pattern,
6761 strlen(pattern), syntax, preg);
6763 /* POSIX doesn't distinguish between an unmatched open-group and an
6764 unmatched close-group: both are REG_EPAREN. */
6765 if (ret == REG_ERPAREN)
6771 /* regexec searches for a given pattern, specified by PREG, in the
6774 If NMATCH is zero or REG_NOSUB was set in the cflags argument to
6775 `regcomp', we ignore PMATCH. Otherwise, we assume PMATCH has at
6776 least NMATCH elements, and we set them to the offsets of the
6777 corresponding matched substrings.
6779 EFLAGS specifies `execution flags' which affect matching: if
6780 REG_NOTBOL is set, then ^ does not match at the beginning of the
6781 string; if REG_NOTEOL is set, then $ does not match at the end.
6783 We return 0 if we find a match and REG_NOMATCH if not. */
6786 regexec(const regex_t * preg, const char *string, Element_count nmatch,
6787 regmatch_t pmatch[], int eflags)
6790 struct re_registers regs;
6791 regex_t private_preg;
6792 int len = strlen(string);
6793 re_bool want_reg_info = !preg->no_sub && nmatch > 0;
6795 private_preg = *preg;
6797 private_preg.not_bol = !!(eflags & REG_NOTBOL);
6798 private_preg.not_eol = !!(eflags & REG_NOTEOL);
6800 /* The user has told us exactly how many registers to return
6801 information about, via `nmatch'. We have to pass that on to the
6802 matching routines. */
6803 private_preg.regs_allocated = REGS_FIXED;
6805 if (want_reg_info) {
6806 regs.num_regs = nmatch;
6807 regs.start = TALLOC(nmatch, regoff_t);
6808 regs.end = TALLOC(nmatch, regoff_t);
6809 if (regs.start == NULL || regs.end == NULL)
6810 return (int)REG_NOMATCH;
6813 /* Perform the searching operation. */
6814 ret = re_search(&private_preg, string, len,
6815 /* start: */ 0, /* range: */ len,
6816 want_reg_info ? ®s : (struct re_registers *)0);
6818 /* Copy the register information to the POSIX structure. */
6819 if (want_reg_info) {
6823 for (r = 0; r < nmatch; r++) {
6824 pmatch[r].rm_so = regs.start[r];
6825 pmatch[r].rm_eo = regs.end[r];
6829 /* If we needed the temporary register info, free the space now. */
6834 /* We want zero return to mean success, unlike `re_search'. */
6835 return ret >= 0 ? (int)REG_NOERROR : (int)REG_NOMATCH;
6838 /* Returns a message corresponding to an error code, ERRCODE, returned
6839 from either regcomp or regexec. We don't use PREG here. */
6842 regerror(int errcode, const regex_t * preg, char *errbuf,
6843 Memory_count errbuf_size)
6846 Memory_count msg_size;
6848 if (errcode < 0 || (size_t) errcode >= (sizeof(re_error_msgid)
6849 / sizeof(re_error_msgid[0])))
6850 /* Only error codes returned by the rest of the code should be passed
6851 to this routine. If we are given anything else, or if other regex
6852 code generates an invalid error code, then the program has a bug.
6853 Dump core so we can fix it. */
6856 msg = gettext(re_error_msgid[errcode]);
6858 msg_size = strlen(msg) + 1; /* Includes the null. */
6860 if (errbuf_size != 0) {
6861 strncpy(errbuf, msg, errbuf_size - 1);
6862 errbuf[errbuf_size-1]='\0';
6868 /* Free dynamically allocated space used by PREG. */
6870 void regfree(regex_t * preg)
6872 if (preg->buffer != NULL) {
6873 xfree(preg->buffer);
6875 preg->buffer = NULL;
6877 preg->allocated = 0;
6880 if (preg->fastmap != NULL) {
6881 xfree(preg->fastmap);
6883 preg->fastmap = NULL;
6884 preg->fastmap_accurate = 0;
6886 if (preg->translate != NULL) {
6887 xfree(preg->translate);
6889 preg->translate = NULL;
6892 #endif /* not emacs */
6896 make-backup-files: t
6898 trim-versions-without-asking: nil