1 ;;; sregex.el --- symbolic regular expressions
3 ;; Copyright (C) 1997, 1998, 2000, 2004 Free Software Foundation, Inc.
5 ;; Author: Bob Glickstein <bobg+sregex@zanshin.com>
6 ;; Maintainer: Bob Glickstein <bobg+sregex@zanshin.com>
7 ;; Keywords: extensions
9 ;; This file is part of GNU Emacs.
11 ;; GNU Emacs is free software; you can redistribute it and/or modify
12 ;; it under the terms of the GNU General Public License as published by
13 ;; the Free Software Foundation; either version 2, or (at your option)
16 ;; GNU Emacs is distributed in the hope that it will be useful,
17 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ;; GNU General Public License for more details.
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 ;; Boston, MA 02111-1307, USA.
28 ;; This version of sregex is provided with the JDEE to support
29 ;; users of XEmacs, which does not include sregex in its
32 ;; This package allows you to write regular expressions using a
33 ;; totally new, Lisp-like syntax.
35 ;; A "symbolic regular expression" (sregex for short) is a Lisp form
36 ;; that, when evaluated, produces the string form of the specified
37 ;; regular expression. Here's a simple example:
39 ;; (sregexq (or "Bob" "Robert")) => "Bob\\|Robert"
41 ;; As you can see, an sregex is specified by placing one or more
42 ;; special clauses in a call to `sregexq'. The clause in this case is
43 ;; the `or' of two strings (not to be confused with the Lisp function
44 ;; `or'). The list of allowable clauses appears below.
46 ;; With sregex, it is never necessary to "escape" magic characters
47 ;; that are meant to be taken literally; that happens automatically.
50 ;; (sregexq "M*A*S*H") => "M\\*A\\*S\\*H"
52 ;; It is also unnecessary to "group" parts of the expression together
53 ;; to overcome operator precedence; that also happens automatically.
56 ;; (sregexq (opt (or "Bob" "Robert"))) => "\\(?:Bob\\|Robert\\)?"
58 ;; It *is* possible to group parts of the expression in order to refer
59 ;; to them with numbered backreferences:
61 ;; (sregexq (group (or "Go" "Run"))
63 ;; (backref 1)) => "\\(Go\\|Run\\), Spot, \\1"
65 ;; `sregexq' is a macro. Each time it is used, it constructs a simple
66 ;; Lisp expression that then invokes a moderately complex engine to
67 ;; interpret the sregex and render the string form. Because of this,
68 ;; I don't recommend sprinkling calls to `sregexq' throughout your
69 ;; code, the way one normally does with string regexes (which are
70 ;; cheap to evaluate). Instead, it's wiser to precompute the regexes
71 ;; you need wherever possible instead of repeatedly constructing the
72 ;; same ones over and over. Example:
74 ;; (let ((field-regex (sregexq (opt "resent-")
75 ;; (or "to" "cc" "bcc"))))
79 ;; (re-search-forward field-regex ...)
82 ;; The arguments to `sregexq' are automatically quoted, but the
83 ;; flipside of this is that it is not straightforward to include
84 ;; computed (i.e., non-constant) values in `sregexq' expressions. So
85 ;; `sregex' is a function that is like `sregexq' but which does not
86 ;; automatically quote its values. Literal sregex clauses must be
87 ;; explicitly quoted like so:
89 ;; (sregex '(or "Bob" "Robert")) => "Bob\\|Robert"
91 ;; but computed clauses can be included easily, allowing for the reuse
94 ;; (let ((dotstar '(0+ any))
95 ;; (whitespace '(1+ (syntax ?-)))
96 ;; (digits '(1+ (char (?0 . ?9)))))
97 ;; (sregex 'bol dotstar ":" whitespace digits)) => "^.*:\\s-+[0-9]+"
99 ;; To use this package in a Lisp program, simply (require 'sregex).
101 ;; Here are the clauses allowed in an `sregex' or `sregexq'
105 ;; This stands for the literal string. If it contains
106 ;; metacharacters, they will be escaped in the resulting regex
107 ;; (using `regexp-quote').
109 ;; - the symbol `any'
110 ;; This stands for ".", a regex matching any character except
113 ;; - the symbol `bol'
114 ;; Stands for "^", matching the empty string at the beginning of a line
116 ;; - the symbol `eol'
117 ;; Stands for "$", matching the empty string at the end of a line
119 ;; - (group CLAUSE ...)
120 ;; Groups the given CLAUSEs using "\\(" and "\\)".
122 ;; - (sequence CLAUSE ...)
124 ;; Groups the given CLAUSEs; may or may not use "\\(?:" and "\\)".
125 ;; Clauses grouped by `sequence' do not count for purposes of
126 ;; numbering backreferences. Use `sequence' in situations like
129 ;; (sregexq (or "dog" "cat"
130 ;; (sequence (opt "sea ") "monkey")))
131 ;; => "dog\\|cat\\|\\(?:sea \\)?monkey"
133 ;; where a single `or' alternate needs to contain multiple
137 ;; Matches the same string previously matched by the Nth "group" in
138 ;; the same sregex. N is a positive integer.
141 ;; Matches any one of the CLAUSEs by separating them with "\\|".
144 ;; Concatenates the given CLAUSEs and matches zero or more
145 ;; occurrences by appending "*".
148 ;; Concatenates the given CLAUSEs and matches one or more
149 ;; occurrences by appending "+".
151 ;; - (opt CLAUSE ...)
152 ;; Concatenates the given CLAUSEs and matches zero or one occurrence
155 ;; - (repeat MIN MAX CLAUSE ...)
156 ;; Concatenates the given CLAUSEs and constructs a regex matching at
157 ;; least MIN occurrences and at most MAX occurrences. MIN must be a
158 ;; non-negative integer. MAX must be a non-negative integer greater
159 ;; than or equal to MIN; or MAX can be nil to mean "infinity."
161 ;; - (char CHAR-CLAUSE ...)
162 ;; Creates a "character class" matching one character from the given
163 ;; set. See below for how to construct a CHAR-CLAUSE.
165 ;; - (not-char CHAR-CLAUSE ...)
166 ;; Creates a "character class" matching any one character not in the
167 ;; given set. See below for how to construct a CHAR-CLAUSE.
169 ;; - the symbol `bot'
170 ;; Stands for "\\`", matching the empty string at the beginning of
171 ;; text (beginning of a string or of a buffer).
173 ;; - the symbol `eot'
174 ;; Stands for "\\'", matching the empty string at the end of text.
176 ;; - the symbol `point'
177 ;; Stands for "\\=", matching the empty string at point.
179 ;; - the symbol `word-boundary'
180 ;; Stands for "\\b", matching the empty string at the beginning or
183 ;; - the symbol `not-word-boundary'
184 ;; Stands for "\\B", matching the empty string not at the beginning
187 ;; - the symbol `bow'
188 ;; Stands for "\\<", matching the empty string at the beginning of a
191 ;; - the symbol `eow'
192 ;; Stands for "\\>", matching the empty string at the end of a word.
194 ;; - the symbol `wordchar'
195 ;; Stands for the regex "\\w", matching a word-constituent character
196 ;; (as determined by the current syntax table)
198 ;; - the symbol `not-wordchar'
199 ;; Stands for the regex "\\W", matching a non-word-constituent
203 ;; Stands for the regex "\\sCODE", where CODE is a syntax table code
204 ;; (a single character). Matches any character with the requested
207 ;; - (not-syntax CODE)
208 ;; Stands for the regex "\\SCODE", where CODE is a syntax table code
209 ;; (a single character). Matches any character without the
213 ;; This is a "trapdoor" for including ordinary regular expression
214 ;; strings in the result. Some regular expressions are clearer when
215 ;; written the old way: "[a-z]" vs. (sregexq (char (?a . ?z))), for
216 ;; instance. However, see the note under "Bugs," below.
218 ;; Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
219 ;; has one of the following forms:
222 ;; Adds that character to the set.
225 ;; Adds all the characters in the string to the set.
227 ;; - A pair (MIN . MAX)
228 ;; Where MIN and MAX are characters, adds the range of characters
229 ;; from MIN through MAX to the set.
233 ;; An earlier version of this package could optionally translate the
234 ;; symbolic regex into other languages' syntaxes, e.g. Perl. For
235 ;; instance, with Perl syntax selected, (sregexq (or "ab" "cd")) would
236 ;; yield "ab|cd" instead of "ab\\|cd". It might be useful to restore
239 ;; - handle multibyte chars in sregex--char-aux
240 ;; - add support for character classes ([:blank:], ...)
241 ;; - add support for non-greedy operators *? and +?
242 ;; - bug: (sregexq (opt (opt ?a))) returns "a??" which is a non-greedy "a?"
248 (eval-when-compile (require 'cl))
;; XEmacs does not have `make-bool-vector'.  When the function is
;; missing, define it as an alias of `make-vector'.
(or (fboundp 'make-bool-vector)
    (defalias 'make-bool-vector 'make-vector))
255 ;; Compatibility code for when we didn't have shy-groups
(defvar sregex--current-sregex nil
  "Compatibility variable kept from the pre-shy-group implementation.
It is initialized to nil and is not referenced by the code visible in
this file.")
(defun sregex-info ()
  "Compatibility stub; always return nil."
  nil)
(defmacro sregex-save-match-data (&rest forms)
  "Compatibility wrapper: run FORMS inside `save-match-data'.
The expansion is simply (save-match-data FORMS...)."
  `(save-match-data ,@forms))
(defun sregex-replace-match (r &optional f l str subexp x)
  "Compatibility wrapper around `replace-match'.
R, F, L, STR and SUBEXP are passed to `replace-match' unchanged, in
that order.  The trailing argument X is accepted and ignored."
  (replace-match r
                 f
                 l
                 str
                 subexp))
(defun sregex-match-string (c &optional i x)
  "Compatibility wrapper around `match-string'.
C and I are passed to `match-string' unchanged.  The trailing
argument X is accepted and ignored."
  (match-string c i))
(defun sregex-match-string-no-properties (count &optional in-string sregex)
  "Compatibility wrapper around `match-string-no-properties'.
COUNT and IN-STRING are passed through unchanged.  The trailing
argument SREGEX is accepted and ignored."
  (match-string-no-properties count
                              in-string))
(defun sregex-match-beginning (count &optional sregex)
  "Compatibility wrapper: return (match-beginning COUNT).
The trailing argument SREGEX is accepted and ignored."
  (match-beginning count))
(defun sregex-match-end (count &optional sregex)
  "Compatibility wrapper: return (match-end COUNT).
The trailing argument SREGEX is accepted and ignored."
  (match-end count))
(defun sregex-match-data (&optional sregex)
  "Compatibility wrapper: return the value of (match-data).
The argument SREGEX is accepted and ignored."
  (match-data))
(defun sregex-backref-num (n &optional sregex)
  "Compatibility stub: return the backreference number N unchanged.
The trailing argument SREGEX is accepted and ignored."
  n)
(defun sregex (&rest exps)
  "Symbolic regular expression interpreter.
Return the regular expression string described by the clauses EXPS.
This is exactly like `sregexq' (q.v.) except that it evaluates all its
arguments, so literal sregex clauses must be quoted.  For example:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

Because the arguments are evaluated, sregex clauses can be stored in
variables and reused:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\""
  (sregex--sequence exps nil))
(defmacro sregexq (&rest exps)
  "Symbolic regular expression interpreter.
This macro allows you to specify a regular expression (regexp) in
symbolic form, and converts it into the string form required by Emacs's
regex functions such as `re-search-forward' and `looking-at'.  The
clauses EXPS are not evaluated.  Here is a simple example:

  (sregexq (or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

As you can see, an sregex is specified by placing one or more special
clauses in a call to `sregexq'.  The clause in this case is the `or'
of two strings (not to be confused with the Lisp function `or').  The
list of allowable clauses appears below.

With `sregex', it is never necessary to \"escape\" magic characters
that are meant to be taken literally; that happens automatically.
For example:

  (sregexq \"M*A*S*H\")  =>  \"M\\\\*A\\\\*S\\\\*H\"

It is also unnecessary to \"group\" parts of the expression together
to overcome operator precedence; that also happens automatically.
For example:

  (sregexq (opt (or \"Bob\" \"Robert\")))  =>  \"\\\\(?:Bob\\\\|Robert\\\\)?\"

It *is* possible to group parts of the expression in order to refer
to them with numbered backreferences:

  (sregexq (group (or \"Go\" \"Run\"))
           \", Spot, \"
           (backref 1))  =>  \"\\\\(Go\\\\|Run\\\\), Spot, \\\\1\"

The grouping that `sregexq' introduces on its own uses shy groups
\(\"\\\\(?:\" and \"\\\\)\"), which are not numbered, so backreference
numbers count only your explicit `group' clauses:

  (sregexq (opt \"resent-\")
           (group (or \"to\" \"cc\" \"bcc\"))
           \": \"
           (backref 1))  =>  \"\\\\(?:resent-\\\\)?\\\\(to\\\\|cc\\\\|bcc\\\\): \\\\1\"

`sregexq' is a macro.  Each time it is used, it constructs a simple
Lisp expression that then invokes a moderately complex engine to
interpret the sregex and render the string form.  Because of this, I
don't recommend sprinkling calls to `sregexq' throughout your code,
the way one normally does with string regexes (which are cheap to
evaluate).  Instead, it's wiser to precompute the regexes you need
wherever possible instead of repeatedly constructing the same ones
over and over.  Example:

  (let ((field-regex (sregexq (opt \"resent-\")
                              (or \"to\" \"cc\" \"bcc\"))))
    ...
    (re-search-forward field-regex ...)
    ...)

The arguments to `sregexq' are automatically quoted, but the
flipside of this is that it is not straightforward to include
computed (i.e., non-constant) values in `sregexq' expressions.  So
`sregex' is a function that is like `sregexq' but which does not
automatically quote its values.  Literal sregex clauses must be
explicitly quoted like so:

  (sregex '(or \"Bob\" \"Robert\"))  =>  \"Bob\\\\|Robert\"

but computed clauses can be included easily, allowing for the reuse
of common clauses:

  (let ((dotstar '(0+ any))
        (whitespace '(1+ (syntax ?-)))
        (digits '(1+ (char (?0 . ?9)))))
    (sregex 'bol dotstar \":\" whitespace digits))  =>  \"^.*:\\\\s-+[0-9]+\"

Here are the clauses allowed in an `sregex' or `sregexq' expression:

- a string
  This stands for the literal string.  If it contains
  metacharacters, they will be escaped in the resulting regex
  (using `regexp-quote').

- the symbol `any'
  This stands for \".\", a regex matching any character except
  newline.

- the symbol `bol'
  Stands for \"^\", matching the empty string at the beginning of a line

- the symbol `eol'
  Stands for \"$\", matching the empty string at the end of a line

- (group CLAUSE ...)
  Groups the given CLAUSEs using \"\\\\(\" and \"\\\\)\".

- (sequence CLAUSE ...)
  Groups the given CLAUSEs; may or may not use \"\\\\(?:\" and \"\\\\)\".
  Clauses grouped by `sequence' do not count for purposes of
  numbering backreferences.  Use `sequence' in situations like
  this:

    (sregexq (or \"dog\" \"cat\"
                 (sequence (opt \"sea \") \"monkey\")))
                                 => \"dog\\\\|cat\\\\|\\\\(?:sea \\\\)?monkey\"

  where a single `or' alternate needs to contain multiple
  subclauses.

- (backref N)
  Matches the same string previously matched by the Nth \"group\" in
  the same sregex.  N is a positive integer.

- (or CLAUSE ...)
  Matches any one of the CLAUSEs by separating them with \"\\\\|\".

- (0+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or more
  occurrences by appending \"*\".

- (1+ CLAUSE ...)
  Concatenates the given CLAUSEs and matches one or more
  occurrences by appending \"+\".

- (opt CLAUSE ...)
  Concatenates the given CLAUSEs and matches zero or one occurrence
  by appending \"?\".

- (repeat MIN MAX CLAUSE ...)
  Concatenates the given CLAUSEs and constructs a regex matching at
  least MIN occurrences and at most MAX occurrences.  MIN must be a
  non-negative integer.  MAX must be a non-negative integer greater
  than or equal to MIN; or MAX can be nil to mean \"infinity.\"

- (char CHAR-CLAUSE ...)
  Creates a \"character class\" matching one character from the given
  set.  See below for how to construct a CHAR-CLAUSE.

- (not-char CHAR-CLAUSE ...)
  Creates a \"character class\" matching any one character not in the
  given set.  See below for how to construct a CHAR-CLAUSE.

- the symbol `bot'
  Stands for \"\\\\`\", matching the empty string at the beginning of
  text (beginning of a string or of a buffer).

- the symbol `eot'
  Stands for \"\\\\'\", matching the empty string at the end of text.

- the symbol `point'
  Stands for \"\\\\=\\=\", matching the empty string at point.

- the symbol `word-boundary'
  Stands for \"\\\\b\", matching the empty string at the beginning or
  end of a word.

- the symbol `not-word-boundary'
  Stands for \"\\\\B\", matching the empty string not at the beginning
  or end of a word.

- the symbol `bow'
  Stands for \"\\\\\\=<\", matching the empty string at the beginning of a
  word.

- the symbol `eow'
  Stands for \"\\\\\\=>\", matching the empty string at the end of a word.

- the symbol `wordchar'
  Stands for the regex \"\\\\w\", matching a word-constituent character
  (as determined by the current syntax table)

- the symbol `not-wordchar'
  Stands for the regex \"\\\\W\", matching a non-word-constituent
  character.

- (syntax CODE)
  Stands for the regex \"\\\\sCODE\", where CODE is a syntax table code
  (a single character).  Matches any character with the requested
  syntax.

- (not-syntax CODE)
  Stands for the regex \"\\\\SCODE\", where CODE is a syntax table code
  (a single character).  Matches any character without the
  requested syntax.

- (regex REGEX)
  This is a \"trapdoor\" for including ordinary regular expression
  strings in the result.  Some regular expressions are clearer when
  written the old way: \"[a-z]\" vs. (sregexq (char (?a . ?z))), for
  instance.

Each CHAR-CLAUSE that is passed to (char ...) and (not-char ...)
has one of the following forms:

- a character
  Adds that character to the set.

- a string
  Adds all the characters in the string to the set.

- A pair (MIN . MAX)
  Where MIN and MAX are characters, adds the range of characters
  from MIN through MAX to the set."
  `(apply 'sregex ',exps))
;; Translate one sregex clause EXP into its regex string form.
;; COMBINE is a context hint supplied by the caller; within this file it
;; is passed as nil, `concat', `or' or `suffix'.
;; NOTE(review): parts of this definition are not visible in this
;; excerpt, so only the visible branches are annotated below.
491 (defun sregex--engine (exp combine)
;; Here EXP is escaped with `regexp-quote' and wrapped in a shy group;
;; the condition guarding this form is not visible.
497 (concat "\\(?:" (regexp-quote exp) "\\)")
;; Pairs of a symbol clause and the regex text it stands for.
509 (word-boundary "\\b")
510 (not-word-boundary "\\B")
;; A list clause whose car is NAME is handed to the function named
;; `sregex--NAME', which is looked up at run time with `intern'.
514 (funcall (intern (concat "sregex--"
515 (symbol-name (car exp))))
;; Anything not handled by an earlier branch is rejected.
518 (t (error "Invalid expression: %s" exp))))
;; Render the list of clauses EXPS as a single regex string.  COMBINE
;; describes the context in which the caller will use the result.
;; NOTE(review): some lines of this definition are not visible in this
;; excerpt.
520 (defun sregex--sequence (exps combine)
;; A single clause needs no joining: render it with the caller's COMBINE.
521 (if (= (length exps) 1) (sregex--engine (car exps) combine)
;; Otherwise each clause is rendered in `concat' context.
523 (lambda (e) (sregex--engine e 'concat))
;; When COMBINE is `suffix' the joined result RE is wrapped in a shy
;; group.  The callers in this file that pass `suffix' go on to append
;; "?", "*", "+" or an interval to the returned string.
525 (if (eq combine 'suffix)
526 (concat "\\(?:" re "\\)")
;; Handler for (or CLAUSE ...): render the alternatives EXPS as one
;; regex string.  COMBINE describes the caller's context.
;; NOTE(review): some lines of this definition are not visible in this
;; excerpt.
529 (defun sregex--or (exps combine)
;; A single alternative is rendered directly with the caller's COMBINE.
530 (if (= (length exps) 1) (sregex--engine (car exps) combine)
;; Otherwise each alternative is rendered in `or' context.
532 (lambda (e) (sregex--engine e 'or))
;; Unless the caller is itself an `or' context, the result RE is
;; wrapped in a shy group.
534 (if (not (eq combine 'or))
535 (concat "\\(?:" re "\\)")
(defun sregex--group (exps combine)
  "Handler for (group CLAUSE ...).
Render the clauses EXPS with `sregex--sequence' (in nil context) and
enclose the result between \"\\\\(\" and \"\\\\)\".  COMBINE is not used."
  (let ((inner (sregex--sequence exps nil)))
    (concat "\\(" inner "\\)")))
(defun sregex--backref (exps combine)
  "Handler for (backref N).
EXPS is the argument list of the clause; its first element N is the
group number.  Return a backslash followed by N in decimal; N is
emitted unchanged.  COMBINE is not used."
  ;; `number-to-string' is what `sregex--repeat' already uses;
  ;; `int-to-string' is only an obsolete alias for it.
  (concat "\\" (number-to-string (car exps))))
(defun sregex--opt (exps combine)
  "Handler for (opt CLAUSE ...).
Render the clauses EXPS with `sregex--sequence' in `suffix' context and
append \"?\" to the result.  COMBINE is not used."
  (let ((body (sregex--sequence exps 'suffix)))
    (concat body "?")))
(defun sregex--0+ (exps combine)
  "Handler for (0+ CLAUSE ...).
Render the clauses EXPS with `sregex--sequence' in `suffix' context and
append \"*\" to the result.  COMBINE is not used."
  (let ((body (sregex--sequence exps 'suffix)))
    (concat body "*")))
(defun sregex--1+ (exps combine)
  "Handler for (1+ CLAUSE ...).
Render the clauses EXPS with `sregex--sequence' in `suffix' context and
append \"+\" to the result.  COMBINE is not used."
  (let ((body (sregex--sequence exps 'suffix)))
    (concat body "+")))
(defun sregex--char (exps combine)
  "Handler for (char CHAR-CLAUSE ...).
Pass the CHAR-CLAUSEs EXPS to `sregex--char-aux' with a nil
complement flag and return its result.  COMBINE is not used."
  (sregex--char-aux nil
                    exps))
(defun sregex--not-char (exps combine)
  "Handler for (not-char CHAR-CLAUSE ...).
Pass the CHAR-CLAUSEs EXPS to `sregex--char-aux' with the complement
flag set to t and return its result.  COMBINE is not used."
  (sregex--char-aux t
                    exps))
(defun sregex--syntax (exps combine)
  "Handler for (syntax CODE).
The first element of EXPS is the syntax code character CODE.  Return
a backslash, the letter s and then CODE.  COMBINE is not used."
  (let ((code (car exps)))
    (format "\\s%c" code)))
(defun sregex--not-syntax (exps combine)
  "Handler for (not-syntax CODE).
The first element of EXPS is the syntax code character CODE.  Return
a backslash, the letter S and then CODE.  COMBINE is not used."
  (let ((code (car exps)))
    (format "\\S%c" code)))
(defun sregex--regex (exps combine)
  "Handler for (regex REGEX).
The first element of EXPS is an ordinary regex string, which is
inserted without escaping.  When COMBINE is non-nil the string is
wrapped in a shy group; when COMBINE is nil it is returned as is."
  (let ((raw (car exps)))
    (cond (combine (concat "\\(?:" raw "\\)"))
          (t raw))))
;; Handler for (repeat MIN MAX CLAUSE ...): render the clauses followed
;; by an interval of the form \{MIN,MAX\}.  COMBINE is not referenced in
;; the visible lines.
554 (defun sregex--repeat (exps combine)
;; The first element of EXPS is the minimum count; nil is read as 0.
555 (let* ((min (or (pop exps) 0))
556 (minstr (number-to-string min))
;; NOTE(review): the binding of `max' used below is not visible in this
;; excerpt.
;; What is left of EXPS is rendered in `suffix' context so that the
;; interval is appended to the rendered clauses.
558 (concat (sregex--sequence exps 'suffix)
;; When `max' is nil nothing is written after the comma.
559 (concat "\\{" minstr ","
560 (when max (number-to-string max)) "\\}"))))
;; Return the text that stands for the characters START through END
;; inside a bracket expression.  START and END are character codes.
;; NOTE(review): some lines of this definition, including its final
;; branch, are not visible in this excerpt.
562 (defun sregex--char-range (start end)
563 (let ((startc (char-to-string start))
564 (endc (char-to-string end)))
;; END more than two above START: written as START, a dash, END.
566 ((> end (+ start 2)) (concat startc "-" endc))
;; END exactly two above START: the three characters are written out.
567 ((> end (+ start 1)) (concat startc (char-to-string (1+ start)) endc))
;; END exactly one above START: the two characters are written out.
568 ((> end start) (concat startc endc))
;; Build the bracket expression for (char ...) and (not-char ...).
;; ARGS is the list of CHAR-CLAUSEs; a non-nil COMPLEMENT puts "^" right
;; after the opening bracket.
;; NOTE(review): many lines of this definition are not visible in this
;; excerpt, so only the visible lines are annotated.
571 (defun sregex--char-aux (complement args)
572 ;; regex-opt does the same, we should join effort.
;; CHARS has 256 slots, all initially nil; slot N is set to t when the
;; character with code N belongs to the class.
573 (let ((chars (make-bool-vector 256 nil))) ; Yeah, right!
;; An integer clause marks that single character.
575 (cond ((integerp arg) (aset chars arg t))
;; A string clause marks each of its characters.  `mapcar' is used only
;; for its side effect here; the list it returns is discarded.
576 ((stringp arg) (mapcar (lambda (c) (aset chars c t)) arg))
;; Pair clause: the car is the start of the range.
578 (let ((start (car arg))
;; Swap START and END (the guarding condition is not visible).
581 (let ((tmp start)) (setq start end) (setq end tmp)))
586 (setq i (1+ i))))))))
587 ;; now chars is a map of the characters in the class
;; CARET and DASH record whether "^" and "-" are in the set.  CLASS
;; starts as "]" when "]" is in the set, so "]" comes first in the text.
588 (let ((caret (aref chars ?^))
589 (dash (aref chars ?-))
590 (class (if (aref chars ?\]) "]" "")))
599 (unless start (setq start i))
;; Each START..END run is rendered by `sregex--char-range' and appended
;; to CLASS.
603 (setq class (concat class (sregex--char-range start end)))
606 (setq class (concat class (sregex--char-range start end)))))
;; With other characters already in CLASS, "^" and then "-" go at the
;; end.  With CLASS empty, "-" is written before "^", so that "^" is not
;; the first character of CLASS.
608 (if (> (length class) 0)
609 (setq class (concat class (if caret "^") (if dash "-")))
610 (setq class (concat class (if dash "-") (if caret "^"))))
;; A non-complemented class of length 1 is handled separately; the form
;; used for that case is not visible.
611 (if (and (not complement) (= (length class) 1))
;; Otherwise: "[", an optional "^" for a complemented class, CLASS, "]".
613 (concat "[" (if complement "^") class "]")))))
617 ;;; sregex.el ends here