1 ;;; w3-parse.el --- Parse HTML and/or SGML for Emacs W3 browser
3 ;; Author: Joe Wells <jbw@cs.bu.edu>
4 ;; Created on: Sat Sep 30 17:25:40 1995
6 ;; Copyright © 1995, 1996, 1997 Joseph Brian Wells
7 ;; Copyright © 1993, 1994, 1995 by William M. Perry <wmperry@cs.indiana.edu>
9 ;; This program is free software; you can redistribute it and/or modify
10 ;; it under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation; either version 2 of the License, or
12 ;; (at your option) any later version.
14 ;; This program is distributed in the hope that it will be useful,
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details.
19 ;; You should have received a copy of the GNU General Public License
20 ;; along with GNU Emacs; see the file COPYING. If not, write to the
21 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 ;; Boston, MA 02111-1307, USA.
26 ;;; Trying to make the best of an evil speed hack.
31 ;; Basically, this file provides one big function (w3-parse-buffer) and
32 ;; some data structures. However, to avoid code redundancy, I have broken
33 ;; out some common subexpressions of w3-parse-buffer into separate
34 ;; functions. I have declared these separate functions with "defsubst" so
35 ;; they will be inlined into w3-parse-buffer. Also, I have defined them
36 ;; within eval-when-compile forms, so no definitions will be emitted into
37 ;; the .elc file for these separate functions. (They will work normally
38 ;; when the uncompiled file is loaded.)
40 ;; Each of these subfunctions use some scratch variables in a purely local
41 ;; fashion. In good software design, I would declare these variables as
42 ;; close to their use as possible with "let". However, "let"-binding
43 ;; variables is *SLOW* in Emacs Lisp, even when compiled. Since each of
44 ;; these functions is executed one or more time during each iteration of
45 ;; the main loop, I deemed this too expensive. So the main function does
46 ;; the "let"-binding of these variables. However, I still want to declare
47 ;; them close to their use, partially to keep the compiler from crying
48 ;; "Wolf!" when there is no danger (well, maybe a little danger :-), so I
49 ;; define some macros for this purpose.
51 ;; Also, there are some variables which are updated throughout the file
52 ;; (remember this is really all one function). Some of the code which
53 ;; updates them is located inside the subfunctions. So that the compiler
54 ;; will not complain, these variables are defined with defvar.
58 ; (require 'url-history)
59 (autoload 'url-expand-file-name "url-expand")
61 (eval-when-compile (require 'cl))
64 (defconst w3-p-s-var-list nil
65 "A list of the scratch variables used by functions called by
66 w3-parse-buffer which it is w3-parse-buffer's responsibility to
69 (defmacro w3-p-s-var-def (var)
70 "Declare VAR as a scratch variable which w3-parse-buffer must
74 (or (memq ',var w3-p-s-var-list)
75 (setq w3-p-s-var-list (cons ',var w3-p-s-var-list)))))
77 (defmacro w3-p-s-let-bindings (&rest body)
78 "\"let\"-bind all of the variables in w3-p-s-var-list in BODY."
79 `(let ,w3-p-s-var-list
81 (put 'w3-p-s-let-bindings 'lisp-indent-function 0)
82 (put 'w3-p-s-let-bindings 'edebug-form-spec t)
84 (defvar w3-p-d-current-element)
85 (put 'w3-p-d-current-element 'variable-documentation
86 "Information structure for the current open element.")
88 (defvar w3-p-d-exceptions)
89 (put 'w3-p-d-exceptions 'variable-documentation
90 "Alist specifying elements (dis)allowed because of an (ex|in)clusion
91 exception of some containing element (not necessarily the immediately
92 containing element). Each item specifies a transition for an element
93 which overrides that specified by the current element's content model.
94 Each item is of the form (TAG ACTION *same ERRORP).")
96 (defvar w3-p-d-in-parsed-marked-section)
97 (put 'w3-p-d-in-parsed-marked-section 'variable-documentation
98 "Are we in a parsed marked section so that we have to scan for \"]]>\"?")
100 (defvar w3-p-d-non-markup-chars)
101 (put 'w3-p-d-non-markup-chars 'variable-documentation
102 "The characters that do not indicate the start of markup, in the format
103 for an argument to skip-chars-forward.")
105 (defvar w3-p-d-null-end-tag-enabled)
106 (put 'w3-p-d-null-end-tag-enabled 'variable-documentation
107 "Is the null end tag (\"/\") enabled?")
109 (defvar w3-p-d-open-element-stack)
110 (put 'w3-p-d-open-element-stack 'variable-documentation
111 "A stack of the currently open elements, with the innermost enclosing
112 element on top and the outermost on bottom.")
114 (defvar w3-p-d-shortrefs)
115 (put 'w3-p-d-shortrefs 'variable-documentation
116 "An alist of the magic entity reference strings in the current
117 between-tags region and their replacements. Each item is of the format
118 \(REGEXP . REPLACEMENT-STRING\). Although in SGML shortrefs normally name
119 entities whose value should be used as the replacement, we have
120 preexpanded the entities for speed. We have also regexp-quoted the
121 strings to be replaced, so they can be used with looking-at. This should
122 never be in an element's overrides field unless
123 w3-p-d-shortref-chars is also in the field.")
125 (defvar w3-p-d-shortref-chars)
126 (put 'w3-p-d-shortref-chars 'variable-documentation
127 "A string of the characters which can start shortrefs in the current
128 between-tags region. This must be in a form which can be passed to
129 skip-chars-forward and must contain exactly the characters which start the
130 entries in w3-p-d-shortrefs. If this variable is mentioned in the
131 overrides field of an element, its handling is magical in that the
132 variable w3-p-d-non-markup-chars is saved to the element's undo-list and
133 updated at the same time. This should never be in an element's overrides
134 field unless w3-p-d-shortrefs is also in the field.")
136 (defvar w3-p-d-tag-name)
137 (put 'w3-p-d-tag-name 'variable-documentation
138 "Name of tag we are looking at, as an Emacs Lisp symbol.
139 Only non-nil when we are looking at a tag.")
141 (defvar w3-p-d-end-tag-p)
142 (put 'w3-p-d-end-tag-p 'variable-documentation
143 "Is the tag we are looking at an end tag?
144 Only non-nil when we are looking at a tag.")
150 ;;; HTML syntax error messages.
155 (defvar w3-p-d-debug-url)
156 (put 'w3-p-d-debug-url 'variable-documentation
157 "Whether to print the URL being parsed before an error messages.
158 Only true for the first error message.")
160 ;; The level parameter indicates whether the error is (1) very
161 ;; serious, must be displayed to all users, (2) invalid HTML, but the
162 ;; user should only be told if the user has indicated interest, or (3)
163 ;; valid HTML which is bad because it appears to rely on the way certain
164 ;; browsers will display it, which should only be displayed to the user
165 ;; if they have really asked for it.
167 (defmacro w3-debug-html (&rest body)
168 "Emit a warning message.
169 These keywords may be used at the beginning of the arguments:
170 :mandatory-if sexp -- force printing if sexp evaluates non-nil.
171 :bad-style -- do not print unless w3-debug-html is 'style.
172 :outer -- do not include the current element in the element
174 :nocontext -- do not include context where error detected.
175 The remaining parameters are treated as the body of a progn, the value of
176 which must be a string to use as the error message."
177 (let (mandatory-if bad-style outer nocontext condition)
178 (while (memq (car body) '(:mandatory-if :bad-style :outer :nocontext))
179 (cond ((eq ':mandatory-if (car body))
180 (setq mandatory-if (car (cdr body)))
181 (setq body (cdr (cdr body))))
182 ((eq ':bad-style (car body))
184 (setq body (cdr body)))
185 ((eq ':nocontext (car body))
187 (setq body (cdr body)))
188 ((eq ':outer (car body))
190 (setq body (cdr body)))))
191 (setq condition (if bad-style
192 '(eq 'style w3-debug-html)
199 (let ((message (progn ,@body)))
201 (w3-debug-html-aux message
203 (list outer nocontext)
204 (if outer '(t)))))))))
206 ;; This is unsatisfactory.
207 (put 'w3-debug-html 'lisp-indent-function 0)
209 (put 'w3-debug-html 'edebug-form-spec
210 '([&rest &or ":nocontext" ":outer" [":mandatory-if" form] ":bad-style"]
214 (defun w3-debug-html-aux (message &optional outer nocontext)
218 ;; Display context information for each error
220 "\n Containing elements: "
221 (w3-open-elements-string (if outer 1))
223 "\n Text around error: "
231 (max (- (point) 27) (point-min))
232 (min (+ (point) 20) (point-max))))
233 (delete-char -7))))))) w3-current-badhtml))
235 (defun w3-quote-for-string (string)
237 (set-buffer (get-buffer-create " w3-quote-whitespace"))
240 (goto-char (point-min))
243 (skip-chars-forward "^\"\\\t\n\r")
245 (insert "\\" (cdr (assq (char-after (point)) '((?\" . "\"")
256 ;;; General entity references and numeric character references.
259 ;; *** I18N HTML support?
261 ;; It's perhaps better to use a suitable display table for these
263 (defconst w3-invalid-sgml-char-replacement
264 `((128 "euro" 8364) ;; U+20AC EURO SIGN
265 (130 "," 8218) ;; U+201A SINGLE LOW-9 QUOTATION MARK
266 (131 "_f" 402) ;; U+0192 LATIN SMALL LETTER F WITH HOOK
267 (132 ",,"8222) ;; U+201E DOUBLE LOW-9 QUOTATION MARK
268 (133 "..." 8230) ;; U+2026 HORIZONTAL ELLIPSIS
269 (134 "(dagger)" 8224) ;; U+2020 DAGGER
270 (135 "(double dagger)" 8225) ;; U+2021 DOUBLE DAGGER
271 (136 ?^ 710) ;; U+02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
272 (137 "%o" 8240) ;; U+2030 PER MILLE SIGN
273 (138 "S\\v" 352) ;; U+0160 LATIN CAPITAL LETTER S WITH CARON
274 (139 ?\< 8249) ;; U+2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
275 (140 "OE" 338) ;; U+0152 LATIN CAPITAL LIGATURE OE
276 (142 "Z\\v" 381) ;; U+017D LATIN CAPITAL LETTER Z WITH CARON
277 (145 ?\` 8216) ;; U+2018 LEFT SINGLE QUOTATION MARK
278 (146 ?\' 8217) ;; U+2019 RIGHT SINGLE QUOTATION MARK
279 (147 "``" 8220) ;; U+201C LEFT DOUBLE QUOTATION MARK
280 (148 "''" 8221) ;; U+201D RIGHT DOUBLE QUOTATION MARK
281 (149 ?o 8226) ;; U+2022 BULLET
282 (150 ?- 8211) ;; U+2013 EN DASH
283 (151 "--" 8212) ;; U+2014 EM DASH
284 (152 ?~ 732) ;; U+02DC SMALL TILDE
285 (153 "(TM)" 8482) ;; U+2122 TRADE MARK SIGN
286 (154 "s\\v" 353) ;; U+0161 LATIN SMALL LETTER S WITH CARON
287 (155 ?\> 8250) ;; U+203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
288 (156 "oe" 339) ;; U+0153 LATIN SMALL LIGATURE OE
289 (158 "z\\v" 382) ;; U+017E LATIN SMALL LETTER Z WITH CARON
290 (159 "Y\\.." 376) ;; U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
292 "Replacements for SGML numeric character references between 128 and 159.
293 \(Such entities are not valid graphic charcters and are assumed to
294 come from the cp1252 character set rather than Unicode.) This is an
295 alist indexed by numeric code. The cdr of each element is a list of
296 an ASCII substitute and the Unicode for the cp1252 character.")
299 (if (fboundp 'int-to-char) ; XEmacs
300 (defun w3-int-to-char (c)
308 (defalias 'w3-int-to-char 'identity)))
310 ;; For older Mule-UCS. This is from Mule-UCS 0.84.
311 (if (and (not (fboundp 'decode-char))
312 (fboundp 'mucs-get-representation-decoding-backend))
313 (defun decode-char (representation object &optional restriction)
314 "Return a character represented by OBJECT in view of REPRESENTATION.
315 Return nil if OBJECT cannot be mapped to only one character.
316 Available representation list can be obtained by mucs-representation-list.
317 Optional argument RESTRICTION specifies a way to map OBJECT to
318 a character. Its interpretation depends on the given
319 REPRESENTATION. If not specified, the default restriction of REPRESENTATION
321 (let ((fs (mucs-get-representation-decoding-backend
322 representation restriction))
329 representation object restriction))))
333 (defun w3-resolve-numeric-char (code)
334 "Return a representation of the numeric character reference CODE.
335 This may be a string or a character. CODE is always interpreted as a
336 Unicode. A Unicode character is returned if function `decode-char' is
337 available. Codes in the range [128,160] are substituted using
338 `w3-invalid-sgml-char-replacement'."
339 ;; Maybe fall back to something like `(format "&%d;" code)' instead
341 (if (fboundp 'decode-char)
342 (progn (if (and (< code 160) (> code 128))
344 (or (nth 2 (assq code w3-invalid-sgml-char-replacement))
346 (or (decode-char 'ucs code) ?~))
347 (w3-int-to-char (cond ((<= code 127)
350 (if (fboundp 'make-char)
351 (make-char 'latin-iso8859-1 (- code 128))
355 (let ((html-entities w3-html-entities))
357 (put (car (car html-entities)) 'html-entity-expansion
358 (cons 'CDATA (if (integerp (cdr (car html-entities)))
359 (let ((ent (w3-resolve-numeric-char
360 (cdr (car html-entities)))))
361 (unless (stringp ent)
362 (char-to-string ent)))
363 (cdr (car html-entities)))))
364 (setq html-entities (cdr html-entities))))
366 ;; These are the general entities in HTML 3.0 in terms of which the math
367 ;; shortrefs are defined:
369 ;; <!ENTITY REF1 STARTTAG "SUP">
370 ;; <!ENTITY REF2 ENDTAG "SUP">
371 ;; <!ENTITY REF3 STARTTAG "SUB">
372 ;; <!ENTITY REF4 ENDTAG "SUB">
373 ;; <!ENTITY REF5 STARTTAG "BOX">
374 ;; <!ENTITY REF6 ENDTAG "BOX">
376 ;; We're ignoring them because these names should really be local to the
377 ;; DTD and not visible in the document. They might change at any time in
378 ;; future HTML standards.
380 ;; <!--Entities for language-dependent presentation (BIDI and contextual analysis) -->
381 ;; <!ENTITY zwnj CDATA "‌"-- zero width non-joiner-->
382 ;; <!ENTITY zwj CDATA "‍"-- zero width joiner-->
383 ;; <!ENTITY lrm CDATA "‎"-- left-to-right mark-->
384 ;; <!ENTITY rlm CDATA "‏"-- right-to-left mark-->
386 ;; Entity names are case sensitive!
388 ;; & should only be recognized when followed by letter or # and
389 ;; digit or # and letter.
391 (eval-when-compile (defvar w3-invalid-sgml-char-replacement))
394 (w3-p-s-var-def w3-p-s-entity)
395 (w3-p-s-var-def w3-p-s-pos)
396 (w3-p-s-var-def w3-p-s-num)
397 ;; Destroys free variables:
398 ;; w3-p-s-entity, w3-p-s-pos, w3-p-s-num
399 ;; Depends on case-fold-search being t.
400 (defsubst w3-expand-entity-at-point-maybe ()
401 ;; We are looking at a &.
402 ;; Only &A or  or &#A syntax is special.
404 ((and (looking-at "&\\([a-z][-a-z0-9.]*\\)[\ ;\n]?") ; \n should be \r
405 ;; We are looking at a general entity reference, maybe undefined.
408 (intern (buffer-substring (match-beginning 1) (match-end 1)))
409 'html-entity-expansion)))
411 ;; If the reference was undefined, then for SGML, we should really
412 ;; issue a warning and delete the reference. However, the HTML
413 ;; standard (contradicting the SGML standard) says to leave the
414 ;; undefined reference in the text.
416 ;; We are looking at a defined general entity reference.
418 (cond ((eq 'CDATA (car w3-p-s-entity))
419 ;; Leave point after expansion so we don't rescan it.
420 (insert (cdr w3-p-s-entity)))
421 ((memq (car w3-p-s-entity) '(nil STARTTAG ENDTAG MS MD))
422 ;; nil is how I mark ordinary entities.
423 ;; The replacement text gets rescanned for all of these.
424 (setq w3-p-s-pos (point))
425 (insert (cdr (assq (car w3-p-s-entity)
432 (cdr (assq (car w3-p-s-entity)
438 (goto-char w3-p-s-pos)
439 ;; *** Strictly speaking, if we parse anything from the
440 ;; replacement text, it must end before the end of the
443 ((eq 'SDATA (car w3-p-s-entity))
444 (insert "[Unimplemented SDATA \"%s\"]" (cdr w3-p-s-entity)))
445 ((eq 'PI (car w3-p-s-entity))
446 ;; We are currently ignoring processing instructions.
447 ;; *** Strictly speaking, we should issue a warning if this
448 ;; occurs in a attribute value.
451 ;; *** We don't handle external entities yet.
452 (error "[Unimplemented entity: \"%s\"]" w3-p-s-entity))))
454 ;;; What was this regexp supposed to be?
455 ;;; ((looking-at "&#[0-9][0-9]*\\([\ ;\n]?\\)") ; \n should be \r
456 ((looking-at "&#[0-9]+\\([ ;\n]?\\)") ; \n should be \r
457 ;; We are looking at a numeric character reference.
458 ;; Ensure the number is already terminated by a semicolon or carriage
459 ;; return so we can use "read" to get it as a number quickly.
460 (cond ((= (match-beginning 1) (match-end 1))
461 ;; This is very uncommon, so we don't have to be quick here but
464 (goto-char (match-end 0)) ; same as match-end 1
466 ;; Set up the match data properly
467 (looking-at "&#[0-9]+;")))
469 (setq w3-p-s-num (read (current-buffer)))
470 ;; Always leave point after the expansion of a numeric character
471 ;; reference, like it were a CDATA entity. Don't zap a
472 ;; delimiter other than `;'.
473 (if (eq ?\; (char-before (match-end 0)))
475 (replace-match (match-string 1))
477 ;; The condition-case is probably not necessary now.
479 (insert (w3-resolve-numeric-char w3-p-s-num))
480 (error (insert "~"))))
481 ((looking-at "&#x\\([0-9a-f]+\\)\\([ ;\n]?\\)")
482 ;; Similarly to above, but for hex numbers.
483 (cond ((= (match-beginning 2) (match-end 2))
485 (goto-char (match-end 0))
487 (looking-at "&#x[0-9a-f]+;")))
488 (setq w3-p-s-num (string-to-number (match-string 1) 16))
489 (if (eq ?\; (char-before (match-end 0)))
491 (replace-match (match-string 2))
494 (insert (w3-resolve-numeric-char w3-p-s-num))
495 (error (insert "~"))))
496 ((looking-at "&#\\(re\\|rs\\|space\\|tab\\)[\ ;\n]?") ; \n should be \r
497 (replace-match (assq (upcase (char-after (+ 3 (point))))
498 '(;; *** Strictly speaking, record end should be
501 ;; *** And record start should be line feed.
505 ;; Leave point after the expansion of a character reference, so it
506 ;; doesn't get rescanned.
507 ;; *** Strictly speaking, we should issue a warning for &#foo; if foo
508 ;; is not a function character in the SGML declaration.
511 ((eq ?& (char-after (point)))
512 ;; We are either looking at an undefined reference or a & that does
513 ;; not start a reference (in which case we should not have been called).
518 ;; What is the code doing calling us if we're not looking at a "&"?
519 (error "this should never happen"))))
525 ;;; Syntax table used in markup declarations.
528 (defvar w3-sgml-md-syntax-table
529 (let ((table (make-syntax-table))
531 (0 "." 255) ; clear everything
548 ;; "-" can be a character in a NAME, but it is also used in
549 ;; "--" as both a comment start and end within SGML
550 ;; declarations ("<!" ... ">"). In HTML, it is only used
551 ;; as a NAME character in the parameter entities
552 ;; Content-Type, HTTP-Method, and style-notations and in
553 ;; the attribute name http-equiv and in the notation names
554 ;; dsssl-lite and w3c-style. We would like to be able to
555 ;; train Emacs to skip over these kinds of comments with
556 ;; forward-sexp and backward-sexp. Is there any way to
557 ;; teach Emacs how to do this? It doesn't seem to be the
562 (let* ((item (car items))
564 (syntax (car (cdr item)))
565 (bound (or (car-safe (cdr-safe (cdr item)))
567 (while (<= char bound)
568 (modify-syntax-entry char syntax table)
569 (setq char (1+ char))))
570 (setq items (cdr items)))
572 "A syntax table for parsing SGML markup declarations.")
576 ;;; Element information data type.
579 ;; The element information data type is used in two ways:
581 ;; * To store the DTD, there is one element record for each element in
584 ;; * To store information for open elements in the current parse tree.
585 ;; Each such element is initialized by copying the element record
586 ;; from the DTD. This means that values in the fields can not be
587 ;; destructively altered, although of course the fields can be
590 ;; The cells in this vector are:
592 ;; name: the element's name (a generic identifier).
594 ;; end-tag-name: a symbol whose name should be the result of prefixing
595 ;; the generic-identifier with a slash. This is a convenience value for
596 ;; interfacing with the display engine which expects a stream of start
597 ;; and end tags in this format rather than a tree.
599 ;; content-model: a data structure describing what elements or character
600 ;; data we expect to find within this element. This is either a symbol
603 ;; EMPTY: no content, no end-tag allowed.
604 ;; CDATA: all data characters until "</[a-z]" is seen.
605 ;; XCDATA: special non-SGML-standard mode which includes all data
606 ;; characters until "</foo" is seen where "foo" is the name of this
607 ;; element. (for XMP and LISTING)
608 ;; XXCDATA: special non-SGML-standard mode which includes all data
609 ;; until end-of-entity (end-of-buffer for us). (for PLAINTEXT)
610 ;; RCDATA: all data characters until "</[a-z]" is seen, except that
611 ;; entities are expanded first, although the expansions are not
612 ;; scanned for end-tags.
613 ;; XINHERIT: special non-SGML-standard mode which means to use the
614 ;; content model of the containing element instead.
616 ;; or a vector of this structure:
618 ;; [(INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT) ...]
620 ;; where INCLUDES is of the format:
624 ;; where each TRANSITION is one of these:
626 ;; (ACTION NEW-STATE ERRORP)
627 ;; (ACTION NEW-STATE)
630 ;; where DEFAULT is one of these:
634 ;; where the meaning of the components is:
636 ;; INCLUDES is a list of tags for which the transition (*include *same
639 ;; DEFAULT if non-nil is a transition that should be taken when
640 ;; matching any possibility not explicitly listed in another
641 ;; TRANSITION, except for data characters containing only whitespace.
643 ;; INCSPACEP specifies how to handle data characters which include
644 ;; only whitespace characters. The value is non-nil to indicate
645 ;; (*include *same nil) or nil to indicate (*discard *same nil).
647 ;; TAG is a symbol corresponding to the start-tag we are looking at,
648 ;; or *data when seeing character data that includes at least one
649 ;; non-space character.
652 ;; *close: Close this element and try again using content model of
653 ;; enclosing element. (Note that this does not apply to the
654 ;; case of an element being closed by its own end-tag.)
655 ;; *include: Process new element as subelement of this one or
656 ;; include data characters directly.
657 ;; *discard: Discard a start-tag or data characters.
658 ;; *retry: Try again after processing NEW-STATE and ERRORP.
659 ;; ELEMENT: Open ELEMENT (with default attributes), then try again
660 ;; using its content model.
662 ;; NEW-STATE (optional, default *same) is the index of the state to
663 ;; move to after processing the element or one of these:
664 ;; *same: no state change occurs.
665 ;; *next: change the current state + 1.
666 ;; The initial state is 0. NEW-STATE does not matter if ACTION is
669 ;; ERRORP (optional, default nil) if non-nil indicates this transition
670 ;; represents an error. The error message includes this value if it
673 ;; If no matching transition is found, the default transition is
674 ;; (*discard *same "not allowed here").
676 ;; overrides: An alist of pairs of the form (VAR REPLACEP . VALUE).
677 ;; When this element is opened, the old value of VAR is saved in the
678 ;; undo-list. If REPLACEP is non-nil, then VAR gets value VALUE,
679 ;; otherwise VAR gets value (append VALUE (symbol-value VAR)). Useful
680 ;; values for VAR are:
682 ;; w3-p-d-exceptions: See doc string.
684 ;; w3-p-d-shortrefs: See doc string.
686 ;; w3-p-d-shortref-chars: See doc string.
688 ;; end-tag-omissible: Whether it is legal to omit the end-tag of this
689 ;; element. If an end-tag is inferred for an element whose end tag is
690 ;; not omissible, an error message is given.
692 ;; state: The current state in the content model. Preset to the initial
695 ;; undo-list: an alist of of former values of local variables
696 ;; of w3-parse-buffer to restore upon closing this element. Each
697 ;; item on the list is of the format (VAR . VALUE-TO-RESTORE).
699 ;; attributes: an alist of attributes and values. Each item on
700 ;; this list is of the format (ATTRIBUTE-NAME . VALUE). Each
701 ;; ATTRIBUTE-NAME is a symbol and each attribute value is a
704 ;; content: a list of the accumulated content of the element. While the
705 ;; element is open, the list is in order from latest to earliest,
706 ;; otherwise it is in order from earliest to latest. Each member is
707 ;; either a string of data characters or a list of the form (NAME
708 ;; ATTRIBUTES CONTENT), where NAME is the subelement's name, ATTRIBUTES
709 ;; is an alist of the subelement's attribute names (lowercase symbols)
710 ;; and their values (strings), and CONTENT is the subelement's content.
714 (defconst w3-element-fields
715 '(name end-tag-name content-model state overrides undo-list
716 content attributes end-tag-omissible deprecated))
718 (let* ((fields w3-element-fields)
719 (index (1- (length fields))))
721 (let* ((field (symbol-name (car fields)))
722 (get-sym (intern (concat "w3-element-" field)))
723 (set-sym (intern (concat "w3-set-element-" field))))
725 (defmacro ,get-sym (element)
726 (list 'aref element ,index))
727 (defmacro ,set-sym (element value)
728 (list 'aset element ,index value)))))
729 (setq fields (cdr fields))
730 (setq index (1- index))))
732 (defmacro w3-make-element ()
733 (list 'make-vector (length w3-element-fields) nil))
735 ;; *** move this to be with DTD declaration.
736 (defmacro w3-fresh-element-for-tag (tag)
738 (or (get ,tag 'html-element-info)
739 (error "unimplemented element %s"
740 (w3-sgml-name-to-string ,tag)))))
742 ;; *** move this to be with DTD declaration.
743 (defmacro w3-known-element-p (tag)
744 `(get ,tag 'html-element-info))
746 (defsubst w3-sgml-name-to-string (sym)
747 (upcase (symbol-name sym)))
753 ;;; Parse tree manipulation.
756 ;; ;; Find the name of the previous element or a substring of the
757 ;; ;; preceding data characters.
758 ;; (let ((content (w3-element-content (car stack))))
761 ;; ((and (stringp (car content))
762 ;; (not (string-match "\\`[ \t\n\r]*\\'" (car content))))
763 ;; (setq prior-item (car content))
764 ;; ;; Trim trailing whitespace
765 ;; (if (string-match "\\(.*[^ \t\n\r]\\)[ \t\n\r]*\\'" prior-item)
766 ;; (setq prior-item (substring prior-item 0 (match-end 1))))
767 ;; (if (> (length prior-item) 8)
768 ;; (setq prior-item (concat "..." (substring prior-item -8))))
769 ;; (setq prior-item (w3-quote-for-string prior-item))
770 ;; (setq prior-item (concat "\(after " prior-item "\)"))
771 ;; (setq content nil))
772 ;; ((and (consp (car content))
773 ;; (symbolp (car (car content))))
775 ;; (concat "\(after "
776 ;; (w3-sgml-name-to-string (car (car content)))
778 ;; (setq content nil))
780 ;; (setq content (cdr content))))))
782 ;; Only used for HTML debugging.
783 (defun w3-open-elements-string (&optional skip-count)
784 (let* ((stack (nthcdr (or skip-count 0)
785 (cons w3-p-d-current-element
786 w3-p-d-open-element-stack)))
787 ;;(prior-item "(at start)")
789 ;; Accumulate the names of the enclosing elements.
791 (let ((element (w3-element-name (car stack))))
792 (if (eq '*holder element)
794 ;; Only include *DOCUMENT if there are no other elements.
795 (if (or (not (eq '*document element))
797 (setq result (cons (w3-sgml-name-to-string element)
799 (setq stack (cdr stack)))
800 (setq result (mapconcat 'identity result ":"))
807 ;; *** This doesn't really belong here, but where?
809 (defmacro w3-invalid-sgml-chars ()
810 "Characters not allowed in an SGML document using the reference
811 concrete syntax (i.e. HTML). Returns a string in the format expected by
813 "\000-\010\013\014\016-\037\177-\237"))
817 ;; w3-p-d-null-end-tag-enabled, w3-p-d-in-parsed-marked-section,
818 ;; w3-p-d-shortref-chars
819 ;; Modifies free variable:
820 ;; w3-p-d-non-markup-chars
821 (defsubst w3-update-non-markup-chars ()
822 (setq w3-p-d-non-markup-chars
824 (w3-invalid-sgml-chars)
825 (if w3-p-d-null-end-tag-enabled "/" "")
826 (if w3-p-d-in-parsed-marked-section "]" "")
827 (or w3-p-d-shortref-chars ""))))
831 (w3-p-s-var-def w3-p-s-overrides)
832 (w3-p-s-var-def w3-p-s-undo-list)
833 (w3-p-s-var-def w3-p-s-var)
834 ;; Uses free variables:
835 ;; w3-p-d-non-markup-chars
836 ;; Modifies free variables:
837 ;; w3-p-d-current-element, w3-p-d-open-element-stack
838 ;; Destroys free variables:
839 ;; w3-p-s-overrides, w3-p-s-undo-list, w3-p-s-var
840 (defsubst w3-open-element (tag attributes)
842 ;; Push new element on stack.
843 (setq w3-p-d-open-element-stack (cons w3-p-d-current-element
844 w3-p-d-open-element-stack))
845 (setq w3-p-d-current-element (w3-fresh-element-for-tag tag))
847 ;; Warn if deprecated or obsolete.
848 (if (w3-element-deprecated w3-p-d-current-element)
849 (w3-debug-html :outer
850 (format "%s element %s."
852 (w3-element-deprecated w3-p-d-current-element))
855 (w3-sgml-name-to-string
856 (w3-element-name w3-p-d-current-element)))))
859 ;; *** we are not handling #CURRENT attributes (HTML has none).
860 (w3-set-element-attributes w3-p-d-current-element attributes)
861 ;; *** Handle default attribute values.
862 ;; *** Fix the attribute name for unnamed values. Right now they will
863 ;; be in the attribute list as items of the format (VALUE . VALUE) where
864 ;; both occurrences of VALUE are the same. The first one needs to be
865 ;; changed to the proper attribute name by consulting the DTD.
866 ;; ********************
868 ;; Handle syntax/semantics overrides of new current element.
869 (cond ((w3-element-overrides w3-p-d-current-element)
870 (setq w3-p-s-overrides
871 (w3-element-overrides w3-p-d-current-element))
872 (setq w3-p-s-undo-list nil)
873 (while w3-p-s-overrides
874 (setq w3-p-s-var (car (car w3-p-s-overrides)))
875 (setq w3-p-s-undo-list
876 (cons (cons w3-p-s-var
877 (symbol-value w3-p-s-var))
879 (set w3-p-s-var (if (car (cdr (car w3-p-s-overrides)))
880 (cdr (cdr (car w3-p-s-overrides)))
881 (append (cdr (cdr (car w3-p-s-overrides)))
882 (symbol-value w3-p-s-var))))
884 ;; Magic handling of w3-p-d-shortref-chars.
885 (cond ((eq 'w3-p-d-shortref-chars w3-p-s-var)
886 (setq w3-p-s-undo-list
887 (cons (cons 'w3-p-d-non-markup-chars
888 w3-p-d-non-markup-chars)
890 (w3-update-non-markup-chars)))
891 (setq w3-p-s-overrides (cdr w3-p-s-overrides)))
892 (w3-set-element-undo-list w3-p-d-current-element
895 ;; Handle content-model inheritance. (Very non-SGML!)
896 (if (eq 'XINHERIT (w3-element-content-model w3-p-d-current-element))
897 (w3-set-element-content-model
898 w3-p-d-current-element
899 (w3-element-content-model (car w3-p-d-open-element-stack))))
904 ;; The protocol for handing items to the display engine is as follows.
906 ;; For an element, send (START-TAG . ATTS), each member of the content,
907 ;; and (END-TAG . nil) if the element is allowed to have an end tag.
909 ;; For data characters, send (text . DATA-CHARACTERS).
913 ;; For PLAINTEXT, STYLE, XMP, TEXTAREA send:
914 ;; (START-TAG . ((data . DATA-CHARACTERS) . ATTS)).
916 ;; *** This requires somehow eliminating any subelements of the TEXTAREA
917 ;; element. TEXTAREA can contain subelements in HTML 3.0.
919 ;; For LISTING, send (text . DATA-CHARACTERS). (Is this really correct or
920 ;; is this perhaps a bug in the old parser?) I'm ignoring this for now.
923 (w3-p-s-var-def w3-p-s-undo-list)
924 (w3-p-s-var-def w3-p-s-content)
925 (w3-p-s-var-def w3-p-s-end-tag)
926 ;; Modifies free variables:
927 ;; w3-p-d-current-element, w3-p-d-open-element-stack
928 ;; Accesses free variables:
929 ;; w3-p-d-tag-name, w3-p-d-end-tag-p
930 ;; Destroys free variables:
931 ;; w3-p-s-undo-list, w3-p-s-content, w3-p-s-end-tag
932 (defsubst w3-close-element (&optional inferred)
933 ;; inferred: non-nil if the end-tag of the current element is being
934 ;; inferred due to the presence of content not allowed in the current
935 ;; element. If t, then the tag causing this is in w3-p-d-tag-name and
937 ;; (OLD: ... otherwise it is a symbol indicating the start-tag
938 ;; of an element or *data or *space indicating data characters.)
941 (not (w3-element-end-tag-omissible w3-p-d-current-element)))
943 (format "</%s> end-tag not omissible (required due to %s)"
944 (w3-sgml-name-to-string
945 (w3-element-name w3-p-d-current-element))
946 (cond ((eq t inferred)
947 (format (if w3-p-d-end-tag-p
950 (w3-sgml-name-to-string
952 ;; *** Delete this functionality?
953 ((memq inferred '(*space *data))
956 (format "start-tag for %s"
957 (w3-sgml-name-to-string inferred)))
960 ;; Undo any variable bindings of this element.
961 (cond ((w3-element-undo-list w3-p-d-current-element)
962 (setq w3-p-s-undo-list
963 (w3-element-undo-list w3-p-d-current-element))
964 (while w3-p-s-undo-list
965 (set (car (car w3-p-s-undo-list))
966 (cdr (car w3-p-s-undo-list)))
967 (setq w3-p-s-undo-list (cdr w3-p-s-undo-list)))))
970 (w3-element-end-tag-name w3-p-d-current-element))
972 ;; Fix up the content of the current element in preparation for putting
974 ;; Remove trailing newline from content, if there is one, otherwise send
975 ;; any trailing data character item to display engine.
976 (setq w3-p-s-content (w3-element-content w3-p-d-current-element))
977 (cond ((null w3-p-s-content))
978 ((equal "\n" (car w3-p-s-content))
979 (setq w3-p-s-content (cdr w3-p-s-content)))
982 (cond ;; *** Handle LISTING the way the old parser did.
983 ((eq 'EMPTY (w3-element-content-model w3-p-d-current-element))
984 ;; Do nothing, can't have an end tag.
988 (if (null w3-p-s-content)
991 ;; Don't warn for empty TD elements or empty A elements
992 ;; with no HREF attribute.
993 ;; *** Crude hack that should really be encoded in the
994 ;; element database somehow.
995 (if (or (not (memq (w3-element-name w3-p-d-current-element)
998 (w3-element-attributes w3-p-d-current-element)))
999 (format "Empty %s element."
1000 (w3-sgml-name-to-string
1001 (w3-element-name w3-p-d-current-element))))))))
1003 ;; Put the current element in the proper place in its parent.
1004 ;; This will cause an error if we overpop the stack.
1005 (w3-set-element-content
1006 (car w3-p-d-open-element-stack)
1007 (cons (list (w3-element-name w3-p-d-current-element)
1008 (w3-element-attributes w3-p-d-current-element)
1009 (nreverse w3-p-s-content))
1010 (w3-element-content (car w3-p-d-open-element-stack))))
1013 (setq w3-p-d-current-element (car w3-p-d-open-element-stack))
1014 (setq w3-p-d-open-element-stack (cdr w3-p-d-open-element-stack)))
1020 ;;; A pseudo-DTD for HTML.
1024 ;; This works around the following bogus compiler complaint:
1025 ;; While compiling the end of the data in file w3-parse.el:
1026 ;; ** the function w3-expand-parameters is not known to be defined.
1027 ;; This is a bogus error. Anything of this form will trigger this message:
1028 ;; (eval-when-compile (defun xyzzy () (xyzzy)))
1029 (defun w3-expand-parameters (pars data) nil))
1032 (defun w3-expand-parameters (pars data)
1036 ;; This has to be written carefully to avoid exceeding the
1037 ;; maximum lisp function call nesting depth.
1040 (let ((car-exp (w3-expand-parameters pars (car data))))
1042 (if (and (symbolp (car data))
1043 (not (eq car-exp (car data)))
1044 ;; An expansion occurred.
1046 ;; The expansion was a list, which we splice in.
1048 (append (reverse car-exp) result)
1049 (wrong-type-argument
1050 (if (eq 'listp (nth 1 err))
1051 ;; Wasn't really a "list" since the last
1052 ;; cdr wasn't nil, so don't try to splice
1054 (cons car-exp result)
1055 (signal (car err) (cdr err)))))
1056 (cons car-exp result))))
1057 (setq data (cdr data)))
1058 (append (nreverse result)
1059 (w3-expand-parameters pars data))))
1061 (let ((sym-exp (cdr-safe (assq data pars))))
1063 (w3-expand-parameters pars sym-exp)
1067 (result (copy-sequence data)))
1068 (while (< i (length data))
1070 (w3-expand-parameters pars (aref data i)))
1077 (defun w3-unfold-dtd (items)
1080 (let* ((item (car items))
1083 (or (cdr-safe (assq 'content-model item))
1084 (error "impossible")))
1085 (overrides (cdr-safe (assq 'overrides item)))
1087 (or (cdr-safe (assq 'end-tag-omissible item))
1088 ;; *** Is this SGML standard?
1089 (eq 'EMPTY content-model)))
1090 (deprecated (cdr-safe (assq 'deprecated item)))
1094 (setq name (car names))
1095 (setq names (cdr names))
1097 ;; Create and initialize the element information data
1099 (setq element (w3-make-element))
1100 (w3-set-element-name element name)
1101 (w3-set-element-end-tag-name
1103 (intern (concat "/" (symbol-name name))))
1104 (w3-set-element-state element 0)
1105 (w3-set-element-content-model element content-model)
1106 (w3-set-element-end-tag-omissible element end-tag-omissible)
1108 (or (memq deprecated '(nil t obsolete))
1109 (error "impossible"))
1110 (w3-set-element-deprecated element deprecated)
1112 ;; Inclusions and exclusions are specified differently in the
1113 ;; human-coded DTD than in the format the implementation uses.
1114 ;; The human-coded version is designed to be easy to edit and to
1115 ;; work with w3-expand-parameters while the internal version is
1116 ;; designed to be fast. We have to translate here. This work
1117 ;; is repeated for every element listed in `names' so that the
1118 ;; exclusion exception error messages can be accurate.
1119 (let ((inclusions (cdr-safe (assq 'inclusions item)))
1120 (exclusions (cdr-safe (assq 'exclusions item)))
1121 (exclusion-mode '*close)
1123 (format "%s exclusion" (w3-sgml-name-to-string name)))
1126 (setq exceptions (cons (cons (car inclusions)
1127 '(*include *same nil))
1129 (setq inclusions (cdr inclusions)))
1131 (cond ((memq (car exclusions) '(*discard *include *close))
1132 (setq exclusion-mode (car exclusions)))
1133 ((stringp (car exclusions))
1134 (setq exclusion-message (car exclusions)))
1136 (setq exceptions (cons (list (car exclusions)
1141 (setq exclusions (cdr exclusions)))
1142 (let ((overrides (if exceptions
1143 (cons (cons 'w3-p-d-exceptions
1144 (cons nil exceptions))
1147 (w3-set-element-overrides element overrides)))
1149 (setq result (cons (cons name element) result))))
1150 (setq items (cdr items)))
1153 ;; Load the HTML DTD.
1154 ;; <URL:ftp://ds.internic.net/rfc/rfc1866.txt>
1155 ;; *** Be sure to incorporate rfc1867 when attribute-checking is added.
1156 ;; *** Write function to check sanity of the content-model forms.
1157 ;; *** I18N: Add Q, BDO, SPAN
1161 (put (car pair) 'html-element-info (cdr pair))))
1162 ;; The purpose of this complexity is to speed up loading by
1163 ;; pre-evaluating as much as possible at compile time.
1166 (w3-expand-parameters
1168 (%headempty . (link base meta range))
1169 (%headmisc . (script))
1170 (%head-deprecated . (nextid))
1172 ;; client-side imagemaps
1173 (%imagemaps . (area map))
1174 (%input.fields . (input select textarea keygen label))
1175 ;; special action is taken for %text inside %body.content in the
1176 ;; content model of each element.
1177 (%body.content . (%heading %block style hr div address %imagemaps))
1179 (%heading . (h1 h2 h3 h4 h5 h6))
1181 ;; Emacs-w3 extensions
1182 (%emacsw3-crud . (pinhead flame cookie yogsothoth hype peek))
1184 (%block . (p %list dl form %preformatted
1185 %blockquote isindex fn table fig note
1186 multicol center %block-deprecated %block-obsoleted))
1188 (%preformatted . (pre))
1189 (%blockquote . (bq))
1190 (%block-deprecated . (dir menu blockquote))
1191 (%block-obsoleted . (xmp listing))
1193 ;; Why is IMG in this list?
1194 (%pre.exclusion . (*include img *discard tab math big small sub sup))
1196 (%text . (*data b %notmath sub sup %emacsw3-crud %input.fields))
1197 (%notmath . (%special %font %phrase %misc))
1198 (%font . (i u s strike tt big small sub sup font
1199 roach secret wired)) ;; B left out for MATH
1200 (%phrase . (em strong dfn code samp kbd var cite blink))
1201 (%special . (a nobr img applet object font basefont br script style map math tab span bdo))
1202 (%misc . (q lang au person acronym abbrev ins del))
1204 (%formula . (*data %math))
1205 (%math . (box above below %mathvec root sqrt array sub sup
1207 (%mathvec . (vec bar dot ddot hat tilde))
1208 (%mathface . (b t bt))
1210 (%mathdelims . (over atop choose left right of))
1212 ;; What the hell? This takes BODYTEXT????? No way!
1213 (%bq-content-model . [(nil
1215 (((bodytext) *include *next))
1219 (((credit) *include *next))
1224 ;; non-default bad HTML handling.
1225 (%in-text-ignore . ((p %heading) *discard *same error))
1228 ;; A dummy element that will contain *document.
1230 (content-model . [(nil nil nil nil)]))
1231 ;; The root of the parse tree. We start with a pseudo-element
1232 ;; named *document for convenience.
1234 (content-model . [(nil nil (((html) *include *next)) (html *next))
1238 (*include *same "after document end"))])
1239 (end-tag-omissible . t))
1240 ;; HTML O O (HEAD, BODY)
1242 (content-model . [(nil
1244 (((head) *include *next))
1248 (((body) *include *next)
1250 ((frameset) *include 4)
1255 (((plaintext) *include *next))
1260 (*include *same "after BODY"))
1264 (*include *same "after FRAMESET"))
1266 (end-tag-omissible . t))
1268 (content-model . [((title isindex %headempty %headmisc
1269 style %head-deprecated)
1272 ;; *** Should only close if tag can
1273 ;; legitimately follow head. So many can that
1274 ;; I haven't bothered to enumerate them.
1276 (end-tag-omissible . t))
1277 ;; SCRIPT - - (#PCDATA)
1279 (content-model . XCDATA ; not official, but allows
1280 ; comment hiding of script, and also
1281 ; idiots that use '</' in scripts.
1283 ;; TITLE - - (#PCDATA)
1285 (content-model . RCDATA ; not official
1286 ;; [((*data) include-space nil nil)]
1288 ;; STYLE - O (#PCDATA)
1289 ;; STYLE needs to be #PCDATA to allow omitted end tag. Bleagh.
1291 (content-model . CDATA)
1292 (end-tag-omissible . t))
1294 (content-model . [((banner) nil nil (*retry *next))
1295 ((bodytext) nil nil (bodytext *next))
1296 (nil nil (((plaintext) *close)) nil)])
1297 (inclusions . (spot))
1298 (end-tag-omissible . t))
1299 ;; Do I really want to include BODYTEXT? It has something to do
1300 ;; with mixed content screwing things up, and I don't understand
1301 ;; it. Wait! It's used by BQ!
1303 (content-model . [((%body.content)
1305 ;; Push <P> before data characters. Non-SGML.
1307 ;; Some stupid sites put meta tags in the
1308 ;; middle of their documents. Sigh.
1309 ;; Allow it, but bitch and moan.
1310 ((meta) *include *same "not allowed here")
1311 ;; Closing when seeing CREDIT is a stupidity
1312 ;; caused by BQ's sharing of BODYTEXT. BQ
1313 ;; should have its own BQTEXT.
1314 ((credit plaintext) *close))
1317 (end-tag-omissible . t))
1318 ((div banner center multicol)
1319 (content-model . [((%body.content)
1321 ;; Push <P> before data characters. Non-SGML.
1325 (content-model . [((p)
1327 ;; Push <P> before data characters. Non-SGML.
1331 (content-model . [((%text)
1336 (content-model . [((%text)
1342 (content-model . [((%text)
1345 ;; *** Should only close if tag can
1346 ;; legitimately follow P. So many can that I
1347 ;; don't bother to enumerate here.
1349 (end-tag-omissible . t))
1351 (content-model . [((lh)
1353 (((li) *include *next))
1361 ;; Push <LI> before data characters or block
1364 (;; ((p) b *same nil)
1365 ((%text %block) li *same error))
1368 (content-model . [((%text)
1370 (((dd dt li) *close)
1373 (end-tag-omissible . t))
1375 (content-model . [((li)
1377 (((%text) li *same error))
1379 (exclusions . (%block)))
1381 (content-model . [((%block)
1384 ;; Push <P> before data characters. Non-SGML.
1387 (end-tag-omissible . t)
1388 ;; Better bad HTML handling.
1389 ;; Technically, there are a few valid documents that this will
1390 ;; hose, because you can have H1 inside FORM inside LI. However,
1391 ;; I don't think that should be allowed anyway.
1392 (exclusions . (*discard "not allowed here" %heading)))
1394 (content-model . [((lh)
1396 (((dt dd) *include *next))
1400 ;; Push <DD> before data characters or block
1403 (((%text %block) dd *same error))
1406 (content-model . [((%text)
1411 (end-tag-omissible . t))
1412 ;; DD is just like LI, but we treat it separately because it can be
1413 ;; followed by a different set of elements.
1415 (content-model . [((%block)
1418 ;; Push <P> before data characters. Non-SGML.
1421 (end-tag-omissible . t)
1422 ;; See comment with LI.
1423 (exclusions . (*discard "not allowed here" %heading)))
1425 (content-model . [((%text hr)
1429 (exclusions . (%pre.exclusion)))
1430 ;; BLOCKQUOTE deprecated, BQ okay
1432 (content-model . %bq-content-model))
1434 (content-model . %bq-content-model)
1435 ;; BLOCKQUOTE is deprecated in favor of BQ in the HTML 3.0 DTD.
1436 ;; However, BQ is not even mentioned in the HTML 2.0 DTD. So I
1437 ;; don't think we can enable this yet:
1441 (content-model . [((%body.content)
1443 ;; Push <P> before data characters. Non-SGML.
1447 (content-model . [((overlay) nil nil (*retry *next))
1450 (((caption) *include *next))
1454 (((figtext) *include *next)
1455 ((credit) *retry *next))
1456 ;; *** Should only do this for elements that
1457 ;; can be in FIGTEXT.
1459 (nil nil (((credit) *include *next)) nil)
1460 (nil nil nil nil)]))
1462 (content-model . [((%text)
1467 (content-model . [((%body.content)
1469 ;; Push <P> before data characters. Very non-SGML.
1473 (end-tag-omissible . t))
1474 ((%emacsw3-crud basefont)
1475 (content-model . EMPTY))
1476 ;; FORM - - %body.content -(FORM) +(INPUT|KEYGEN|SELECT|TEXTAREA)
1478 ;; Same as BODY. Ugh!
1479 (content-model . [((%body.content %text)
1481 ;; Push <P> before data characters. Non-SGML.
1484 (exclusions . (form))
1485 (inclusions . (input select textarea keygen label)))
1486 ;; *** Where is the URL describing this?
1488 (content-model . [((%text)
1492 ;; *** These are already included, no need to repeat.
1493 ;;(inclusions . (input select textarea))
1494 ;; *** Is a LABEL allowed inside a LABEL? I assume no.
1495 (exclusions . (label))
1496 ;; The next line just does the default so is unneeded:
1497 ;;(end-tag-omissible . nil)
1499 ;; SELECT - - (OPTION+) -(INPUT|KEYGEN|TEXTAREA|SELECT)>
1500 ;; *** This should be -(everything).
1502 (content-model . [((option) nil nil nil)])
1503 (exclusions . (input label select keygen textarea)))
1504 ;; option - O (#PCDATA)
1505 ;; needs to be #PCDATA to allow omitted end tag.
1507 ;; I'd like to make this RCDATA to avoid problems with inclusions
1508 ;; like SPOT, but that would conflict with the omitted end-tag, I
1510 (content-model . [((*data)
1514 (end-tag-omissible . t))
1515 ;; TEXTAREA - - (#PCDATA) -(INPUT|TEXTAREA|KEYGEN|SELECT)
1517 ;; Same comment as for OPTION about RCDATA.
1518 (content-model . XCDATA) ;;;[((*data) include-space nil nil)])
1519 (exclusions . (input select label keygen textarea)))
1520 ((hr br img isindex input keygen overlay wbr spot tab
1521 %headempty %mathdelims)
1522 (content-model . EMPTY))
1524 (content-model . EMPTY)
1527 (content-model . [((%text)
1530 *include *same "deprecated inside A")
1531 ;; *** I haven't made up my mind whether this
1532 ;; is a good idea. It can result in a lot of
1533 ;; bad formatting if the A is *never* closed.
1534 ;;((p) *discard *same error)
1538 ((b font %font %phrase %misc nobr)
1539 (content-model . [((%text)
1544 (content-model . XXCDATA)
1545 (end-tag-omissible . t)
1546 (deprecated . obsolete))
1548 (content-model . XCDATA)
1549 (deprecated . obsolete))
1550 ;; Latest table spec (as of Nov. 13 1995) is at:
1551 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-ietf-html-tables-03.txt>
1553 (content-model . [(nil
1555 (((caption) *include *next)
1556 ((%text) tr *same error)
1557 ((col colgroup thead tfoot tbody tr) *retry *next))
1558 (*retry *next)) ;error handling
1561 (((thead tfoot tbody tr) *retry *next))
1562 (*retry *next)) ;error handling
1565 (((thead) *include *next)
1566 ((tfoot tbody tr) *retry *next))
1567 (*retry *next)) ;error handling
1570 (((tfoot) *include *next)
1571 ((tbody tr) *retry *next))
1572 (*retry *next)) ;error handling
1578 ((%body.content) tbody *same error))
1581 (content-model . [((col)
1583 (((colgroup thead tfoot tbody tr) *close))
1585 (end-tag-omissible . t))
1587 (content-model . EMPTY))
1589 (content-model . [((tr)
1591 (((tfoot tbody) *close)
1593 ((%body.content) tr *same error))
1595 (end-tag-omissible . t))
1597 (content-model . [((tr)
1601 ((td th) tr *same error)
1602 ((%body.content) tr *same error))
1604 (end-tag-omissible . t))
1606 (content-model . [((td th)
1608 (((tr tfoot tbody) *close)
1610 ((%body.content %text) td *same error))
1612 (end-tag-omissible . t))
1614 ;; Arrgh! Another %body.content!!! Stupid!!!
1615 (content-model . [((%body.content)
1617 (((td th tr tfoot tbody) *close)
1618 ;; Push <P> before data characters. Non-SGML.
1621 (end-tag-omissible . t))
1623 (content-model . [((*data) include-space nil nil)])
1625 ((w3-p-d-shortref-chars t . "\{_^")
1626 (w3-p-d-shortrefs t . (("\\^" . "<sup>")
1629 (inclusions . (%math))
1630 (exclusions . (%notmath)))
1632 (content-model . [((%text)
1637 ((w3-p-d-shortref-chars t . "\{_^")
1638 (w3-p-d-shortrefs t . (("\\^" . "</sup>")
1640 ("{" . "<box>"))))))
1642 (content-model . [((%text)
1647 ((w3-p-d-shortref-chars t . "\{_^")
1648 (w3-p-d-shortrefs t . (("\\^" . "<sup>")
1650 ("{" . "<box>"))))))
1652 (content-model . [((%formula)
1654 (((left) *include 1)
1655 ((over atop choose) *include 2)
1656 ((right) *include 3))
1660 (((over atop choose) *include 2)
1661 ((right) *include 3))
1665 (((right) *include 3))
1667 ((%formula) include-space nil nil)])
1669 ((w3-p-d-shortref-chars t . "{}_^")
1670 (w3-p-d-shortrefs t . (("\\^" . "<sup>")
1673 ("}" . "</box>"))))))
1674 ((above below %mathvec t bt sqrt)
1675 (content-model . [((%formula) include-space nil nil)]))
1676 ;; ROOT has a badly-specified content-model in HTML 3.0.
1678 (content-model . [((%formula)
1680 (((of) *include *next))
1682 ((%formula) include-space nil nil)]))
1684 (content-model . [((%formula) include-space nil nil)])
1685 ;; There is no valid way to infer a missing end-tag for OF. This
1687 (end-tag-omissible . t))
1689 (content-model . [((row) nil nil nil)]))
1691 (content-model . [((item) nil (((row) *close)) nil)])
1692 (end-tag-omissible . t))
1694 (content-model . [((%formula)
1696 (((row item) *close))
1698 (end-tag-omissible . t))
1699 ;; The old parser would look for the </EMBED> end-tag and include
1700 ;; the contents between <EMBED> and </EMBED> as the DATA attribute
1701 ;; of the EMBED start-tag. However, it did not require the
1702 ;; </EMBED> end-tag and did nothing if it was missing. This is
1703 ;; completely impossible to specify in SGML.
1706 ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0603.html>
1708 ;; Questions: Does EMBED require the end-tag? How does NOEMBED fit
1709 ;; into this? Where can EMBED appear?
1711 ;; Nov. 25 1995: a new spec for EMBED (also an I-D):
1712 ;; <URL:http://www.cs.princeton.edu/~burchard/www/interactive/>
1714 ;; Here is my guess how to code EMBED:
1716 (content-model . [((noembed) nil nil (*close))]))
1718 (content-model . [((%body.content) ; hack hack hack
1723 ;; FRAMESET is a Netscape thing.
1724 ;; <URL:http://www.eit.com/goodies/lists/www.lists/www-html.1995q3/0588.html>
1726 (content-model . [((noframes frame frameset) nil nil nil)]))
1728 (content-model . [((%body.content)
1730 ;; Push <P> before data characters. Non-SGML.
1734 (content-model . EMPTY))
1736 ;; APPLET is a Java thing.
1737 ;; OBJECT is a cougar thing
1738 ;; <URL:http://java.sun.com/JDK-beta/filesinkit/README>
1740 ;; I really don't want to add another ANY content-model.
1741 (content-model . XINHERIT)
1742 (inclusions . (param)))
1744 (content-model . EMPTY))
1745 ;; backward compatibility with old Java.
1747 (content-model . EMPTY))
1748 ;; Client-side image maps.
1749 ;; <URL:ftp://ds.internic.net/internet-drafts/draft-seidman-clientsideimagemap-01.txt>
1750 ;; *** The only problem is that I don't know in what elements MAP
1751 ;; can appear, so none of this is reachable yet.
1753 (content-model . [((area) nil nil nil)]))
1755 (content-model . EMPTY))
1760 ;;; Omitted tag inference using state transition tables.
1765 (w3-p-s-var-def w3-p-s-includep)
1766 (w3-p-s-var-def w3-p-s-state-transitions)
1767 (w3-p-s-var-def w3-p-s-transition)
1768 (w3-p-s-var-def w3-p-s-tran-list)
1769 (w3-p-s-var-def w3-p-s-content-model)
1770 (w3-p-s-var-def w3-p-s-except)
1771 (w3-p-s-var-def w3-p-s-baseobject)
1772 (w3-p-s-var-def w3-p-s-btdt)
1773 ;; Uses free variables:
1774 ;; w3-p-d-current-element, w3-p-d-exceptions
1775 ;; Destroys free variables:
1776 ;; w3-p-s-includep, w3-p-s-state-transitions, w3-p-s-transition,
1777 ;; w3-p-s-tran-list, w3-p-s-content-model, w3-p-s-except
1778 ;; Returns t if the element or data characters should be included.
1779 ;; Returns nil if the element or data characters should be discarded.
1780 (defsubst w3-grok-tag-or-data (tag-name)
1783 ((symbolp (setq w3-p-s-content-model
1784 (w3-element-content-model w3-p-d-current-element)))
1785 (or (and (memq w3-p-s-content-model
1786 '(CDATA RCDATA XCDATA XXCDATA))
1787 (memq tag-name '(*data *space)))
1788 ;; *** Implement ANY.
1789 (error "impossible content model lossage"))
1790 (setq w3-p-s-includep t)
1794 ;; We have a complex content model.
1795 ;; Cache some data from the element info structure. Format is:
1796 ;; (INCLUDES INCSPACEP (((TAG ...) . TRANSITION) ...) DEFAULT)
1797 (setq w3-p-s-state-transitions
1798 (aref w3-p-s-content-model
1799 (w3-element-state w3-p-d-current-element)))
1801 ;; Optimize the common cases.
1803 ((eq '*space tag-name)
1804 ;; Optimizing the (*space *discard *same nil) transition.
1805 (setq w3-p-s-includep (car (cdr w3-p-s-state-transitions)))
1808 ((and (not (setq w3-p-s-except
1809 (assq tag-name w3-p-d-exceptions)))
1810 (memq tag-name (car w3-p-s-state-transitions)))
1811 ;; Equivalent to a transition of (TAG *include *same nil).
1812 ;; So we are done, return t to caller.
1813 (setq w3-p-s-includep t)
1817 ;; The general case.
1819 ;; Handle inclusions and exclusions.
1821 (setq w3-p-s-transition (cdr w3-p-s-except)))
1822 ;; See if the transition is in the complex transitions
1825 (setq w3-p-s-tran-list
1826 (car (cdr (cdr w3-p-s-state-transitions))))
1827 (setq w3-p-s-transition nil)
1828 (while w3-p-s-tran-list
1829 (cond ((memq tag-name (car (car w3-p-s-tran-list)))
1830 ;; We've found a transition.
1831 (setq w3-p-s-transition
1832 (cdr (car w3-p-s-tran-list)))
1833 (setq w3-p-s-tran-list nil))
1835 (setq w3-p-s-tran-list (cdr w3-p-s-tran-list)))))
1836 ;; Check if we found it.
1838 ;; body of cond clause empty
1840 ;; Try finding the transition in the DEFAULT component of the
1841 ;; transition table, but avoid doing this for unknown elements,
1842 ;; always use the default-default for them.
1843 ((and (or (eq '*data tag-name)
1844 (w3-known-element-p tag-name))
1845 (setq w3-p-s-transition
1846 (nth 3 w3-p-s-state-transitions)))
1847 ;; body of cond clause empty
1850 ;; Supply a default-default transition.
1851 (if (not (or (eq '*data tag-name)
1852 (w3-known-element-p tag-name)))
1853 (setq w3-p-s-transition
1854 '(*discard *same "unknown element"))
1856 ;; Decide whether to *close or *discard
1857 ;; based on whether this element would be
1858 ;; accepted as valid in an open ancestor.
1859 (let ((open-list w3-p-d-open-element-stack)
1860 (all-end-tags-omissible
1861 (w3-element-end-tag-omissible w3-p-d-current-element))
1862 state-transitions tran-list)
1865 (setq state-transitions
1866 (aref (w3-element-content-model
1868 (w3-element-state (car open-list))))
1869 (if (memq tag-name (car state-transitions))
1871 (setq tran-list (nth 2 state-transitions))
1873 (cond ((memq tag-name (car (car tran-list)))
1874 (if (not (nth 3 (car tran-list)))
1875 ;; Not an error transition.
1877 (setq tran-list nil))
1879 (setq tran-list (cdr tran-list)))))
1880 ;; The input item is not accepted in this
1881 ;; ancestor. Try again in next ancestor.
1882 (or (w3-element-end-tag-omissible (car open-list))
1883 (setq all-end-tags-omissible nil))
1884 (setq open-list (cdr open-list)))
1886 (setq w3-p-s-transition
1887 (if (w3-element-end-tag-omissible
1888 w3-p-d-current-element)
1889 (if all-end-tags-omissible
1890 ;; Probably indicates a need to debug
1891 ;; the DTD state-transition tables.
1893 "missing transition in DTD?")
1894 ;; Error will be reported later.
1896 '(*close *same "not allowed here")))
1897 (setq w3-p-s-transition
1898 '(*discard *same "not allowed here")))))))
1900 ;; We have found a transition to take. The transition is of
1901 ;; the format (ACTION NEW-STATE ERRORP) where the latter two
1902 ;; items are optional.
1904 ;; First, handle any state-change.
1905 (or (memq (car-safe (cdr w3-p-s-transition)) '(nil *same))
1906 (w3-set-element-state
1907 w3-p-d-current-element
1908 (if (eq '*next (car-safe (cdr w3-p-s-transition)))
1909 (1+ (w3-element-state w3-p-d-current-element))
1910 (car-safe (cdr w3-p-s-transition)))))
1912 ;; Handle any error message.
1913 (if (car-safe (cdr-safe (cdr w3-p-s-transition)))
1915 :mandatory-if (and (eq '*data tag-name)
1916 (eq '*discard (car w3-p-s-transition)))
1917 (format "Bad %s [%s], %s"
1918 (if (eq '*data tag-name)
1920 (concat "start-tag "
1921 (w3-sgml-name-to-string tag-name)))
1922 (if (stringp (car (cdr (cdr w3-p-s-transition))))
1923 (car (cdr (cdr w3-p-s-transition)))
1925 (let ((action (car w3-p-s-transition)))
1926 (cond ((eq '*discard action)
1927 "discarding bad item")
1928 ((eq '*close action)
1929 (concat "inferring </"
1930 (w3-sgml-name-to-string
1932 w3-p-d-current-element))
1934 ((eq '*include action)
1935 "including bad item anyway")
1936 ((eq '*retry action)
1937 "*retry ??? you shouldn't see this")
1939 (concat "inferring <"
1940 (w3-sgml-name-to-string action)
1943 ;; Handle the action.
1945 ((eq '*include (car w3-p-s-transition))
1946 (setq w3-p-s-includep t)
1949 ((eq '*close (car w3-p-s-transition))
1950 ;; Perform end-tag inference.
1951 (w3-close-element) ; don't pass parameter
1952 ;; Loop and try again in parent element's content-model.
1954 ((eq '*discard (car w3-p-s-transition))
1955 (setq w3-p-s-includep nil)
1958 ((eq '*retry (car w3-p-s-transition))
1959 ;; Loop and try again after state change.
1961 ((symbolp (car w3-p-s-transition))
1962 ;; We need to open another element to contain the text,
1963 ;; probably a <P> (look in the state table).
1964 (w3-open-element (car w3-p-s-transition) nil)
1965 ;; Now we loop and try again in the new element's
1969 (error "impossible transition")))))))
1971 ;; Empty while loop body.
1974 ;; Return value to user indicating whether to include or discard item:
1986 (defvar w3-last-parse-tree nil
1987 "Used for debugging only. Stores the most recently computed parse tree
1988 \(a tree, not a parse tag stream\).")
1990 (defun w3-display-parse-tree (&optional ptree)
1992 (with-output-to-temp-buffer "W3 HTML Parse Tree"
1993 (set-buffer standard-output)
1996 (pp (or ptree w3-last-parse-tree))))
1998 (defalias 'w3-display-last-parse-tree 'w3-display-parse-tree)
2000 ;; For compatibility with the old parser interface.
2001 (defalias 'w3-preparse-buffer 'w3-parse-buffer)
2003 (defcustom w3-parse-hooks nil
2004 "*List of hooks to be run before parsing."
2007 :options '(w3-parse-munge-ethiopic-text) ; too exotic for a default
2010 (defun w3-parse-munge-ethiopic-text ()
2011 "Treat marked-up regions using `ethio-sera-to-fidel-marker'.
2012 Do nothing in non-Mule or unibyte session."
2013 (when (and (featurep 'mule)
2014 (or (featurep 'xemacs)
2016 (boundp 'default-enable-multibyte-characters)
2017 default-enable-multibyte-characters)))
2018 (ethio-sera-to-fidel-marker)))
2020 (if (fboundp 'char-int)
2021 (defalias 'w3-char-int 'char-int)
2022 (defalias 'w3-char-int 'identity))
2024 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2026 ;; % This is the *ONLY* valid entry point in this file! %
2027 ;; % DO NOT call any of the other functions! %
2029 ;; %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2030 (defun w3-slow-parse-buffer (&optional buff)
2031 "Parse contents of BUFF as HTML.
2032 BUFF defaults to the current buffer.
2033 Destructively alters contents of BUFF.
2034 Returns a data structure containing the parsed information."
2035 (if (not w3-setup-done) (w3-do-setup))
2039 (setq buff (current-buffer)))
2040 (let ((old-syntax-table (syntax-table)))
2041 (set-syntax-table w3-sgml-md-syntax-table)
2042 (buffer-disable-undo (current-buffer))
2043 (widen) ; sanity checking
2044 (goto-char (point-max))
2046 (goto-char (point-min))
2047 (setq case-fold-search t) ; allows smaller regexp patterns
2049 (run-hooks 'w3-parse-hooks);
2051 (goto-char (point-min))
2053 ;; *** Should premunge line boundaries.
2054 ;; ********************
2057 ;; Speed hack, see the variable doc string.
2058 (gc-cons-threshold (if (> w3-gc-cons-threshold-multiplier 0)
2059 (* w3-gc-cons-threshold-multiplier
2063 ;; Used to determine if we made any progress since the last loop.
2064 (last-loop-start (point-min))
2066 ;; How many iterations of the main loop have occurred. Used only
2067 ;; to send messages to the user periodically, since this function
2068 ;; can take some time.
2071 ;; Precomputing the loop-invariant parts of this for speed.
2072 (status-message-format
2074 (format "Parsed %%3d%%%% of %d..." (- (point-max) (point-min)))))
2076 ;; Use a float value for 100 if possible, otherwise integer.
2077 ;; Determine which we can use outside of the loop for speed.
2078 (one-hundred (funcall (if (fboundp 'float) 'float 'identity) 100))
2080 ;; The buffer which contains the HTML we are parsing. This
2081 ;; variable is used to avoid using the more expensive
2083 (parse-buffer (current-buffer))
2085 ;; Points to start of region of text since the previous tag.
2086 (between-tags-start (point-min))
2088 ;; Points past end of region of text since the previous tag. Only
2089 ;; non-nil when the region has been completely determined and is
2090 ;; ready to be processed.
2099 ;; Is the tag we are looking at a null-end-tag-enabling
2103 ;; Attributes of the tag we are looking at. An alist whose items
2104 ;; are pairs of the form (SYMBOL . STRING).
2107 ;; Points past end of attribute value we are looking at. Points
2108 ;; past the syntactic construct, not the value of the attribute,
2109 ;; which may be at (1- attribute-value-end).
2112 ;; Points past end of tag we are looking at.
2116 (w3-p-d-current-element (w3-fresh-element-for-tag '*document))
2119 (w3-p-d-open-element-stack (list (w3-fresh-element-for-tag '*holder)))
2121 ;; ***not implemented yet***
2122 (marked-section-undo-stack nil)
2125 (w3-p-d-debug-url t)
2127 ;; Any of the following variables with the comment ";*NESTED*"
2128 ;; are syntactic or semantic features that were introduced by
2129 ;; some containing element or marked section which will be undone
2130 ;; when we close that element or marked section.
2133 (w3-p-d-non-markup-chars nil) ;*NESTED*
2136 (w3-p-d-null-end-tag-enabled nil) ;*NESTED*
2139 (w3-p-d-in-parsed-marked-section nil) ;*NESTED*
2142 (w3-p-d-shortrefs nil) ;*NESTED*
2145 (w3-p-d-shortref-chars nil) ;*NESTED*
2147 ;; ******* maybe not needed.
2149 ;; ;; Are we recognizing start-tags?
2150 ;; (recognizing-start-tags t) ;*NESTED*
2152 ;; ;; Are we recognizing end-tags? If this is non-nil and not t,
2153 ;; ;; then only the end tag of the current open element is
2155 ;; (recognizing-end-tags t) ;*NESTED*
2158 (w3-p-d-exceptions nil) ;*NESTED*
2160 ;; Scratch variables used in this function
2161 ref attr-name attr-value content-model content open-list
2163 ;; Scratch variables used by macros and defsubsts we call.
2164 (w3-p-s-let-bindings
2165 (w3-update-non-markup-chars)
2166 (setq w3-p-s-baseobject (copy-sequence url-current-object))
2167 ;; Main loop. Handle markup as follows:
2169 ;; non-empty tag: Handle the region since the previous tag as PCDATA,
2170 ;; RCDATA, CDATA, if allowed by syntax. Then handle the tag.
2172 ;; general entity (&name;): expand it and parse the result.
2174 ;; shortref (_, {, }, and ^ in math stuff): Expand it and parse the
2177 ;; SGML marked section (<![ keywords [ conditional-text ]]>): Either
2178 ;; strip the delimiters and parse the result or delete.
2182 ;; empty tag (<>, </>): Handle as the appropriate tag.
2184 ;; markup declaration (e.g. <!DOCTYPE ...>): Delete.
2186 ;; SGML processing instruction (<?name>): Delete.
2189 ;; Continue as long as we processed something last time and we
2190 ;; have more to process.
2192 (not (and (= last-loop-start (point))
2194 (setq last-loop-start (point)))
2196 ;; Display progress messages if asked and/or do incremental display
2198 (cond ((= 0 (% (setq loop-count (1+ loop-count)) 40))
2199 (if status-message-format
2200 (message status-message-format
2201 ;; Percentage of buffer processed.
2202 (/ (* (point) one-hundred) (point-max))))))
2204 ;; Go to next interesting thing in the buffer.
2205 (skip-chars-forward w3-p-d-non-markup-chars)
2207 ;; We are looking at a markup-starting character, and invalid
2208 ;; character, or end of buffer.
2211 ((eq ?< (char-after (point)))
2213 ;; We are looking at a tag, comment, markup declaration, SGML marked
2214 ;; section, SGML processing instruction, or non-markup "<".
2218 ;; jbw 2001-11-02: added possibility of of ":" in element
2219 ;; name to handle Microsoft-generated XHTML.
2220 ((looking-at "/?\\([a-z][-a-z0-9.:]*\\)")
2221 ;; We are looking at a non-empty tag.
2223 ;; Downcase it in the buffer, to save creation of a string
2224 (downcase-region (match-beginning 1) (match-end 1))
2225 (setq w3-p-d-tag-name
2226 (intern (buffer-substring (match-beginning 1)
2228 (setq w3-p-d-end-tag-p (eq ?/ (char-after (point)))
2229 between-tags-end (1- (point)))
2230 (goto-char (match-end 0))
2232 ;; Read the attributes from a start-tag.
2233 (if w3-p-d-end-tag-p
2234 (if (looking-at "[ \t\r\n/]*[<>]")
2236 ;; This is in here to deal with those idiots who stick
2237 ;; attribute/value pairs on end tags. *sigh*
2238 (w3-debug-html "Evil attributes on end tag.")
2239 (skip-chars-forward "^>"))
2241 ;; Attribute values can be:
2242 ;; "STRING" where STRING does not contain the double quote
2243 ;; 'STRING' where STRING does not contain the single quote
2244 ;; name-start character, *name character
2246 ;; Digit, +name character
2248 ;; or a SPACE-separated list of one of the last four
2249 ;; possibilities (there is a comment somewhere that this is a
2250 ;; misinterpretation of the grammar, so we ignore this
2256 ;; Leading whitespace.
2258 ;; The attribute name, possibly with a bad syntax
2260 ;; jbw 2001-11-02: added possibility of ":" to
2261 ;; next line to handle Microsoft-generated XHTML.
2262 "\\([a-z_][-a-z0-9.]*\\(\\([_:][-a-z0-9._:]*\\)?\\)\\)"
2263 ;; Trailing whitespace and perhaps an "=".
2264 "[ \n\r\t]*\\(\\(=[ \n\r\t]*\\)?\\)")))
2266 (cond ((/= (match-beginning 2) (match-end 2))
2269 (format "Bad attribute name syntax: %s"
2270 (buffer-substring (match-beginning 1)
2273 ;; Downcase it in the buffer, to save creation of a string
2274 (downcase-region (match-beginning 1) (match-end 1))
2276 (intern (buffer-substring (match-beginning 1)
2278 (goto-char (match-end 0))
2280 ((< (match-beginning 4) (match-end 4))
2281 ;; A value was specified (e.g. ATTRIBUTE=VALUE).
2286 ;; Comma separated list of literals with double quotes
2288 "\"\\([^\"]*\\(\"[ \n\r\t]*,[ \n\r\t]*\"[^\"]*\\)+\\)\""
2290 ;; Comma separated list of literals with single quotes
2292 "'\\([^']*\\('[ \n\r\t]*,[ \n\r\t]*'[^']*\\)+\\)'"
2294 ;; Literal with double quotes.
2297 ;; Literal with single quotes.
2300 ;; Handle bad HTML conflicting with NET-enabling
2302 "\\([^ \t\n\r>]+/[^ \t\n\r>]+\\)[ \t\n\r>]"
2304 ;; SGML NAME-syntax attribute value.
2305 "\\([-a-z0-9.]+\\)[ \t\n\r></]"
2308 ((or (match-beginning 5)
2311 (match-beginning 3))
2312 (if (or (match-beginning 1)
2313 (match-beginning 3))
2316 (format "Badly quoted attribute value: %s"
2318 ;; We have an attribute value literal.
2319 (narrow-to-region (1+ (match-beginning 0))
2321 ;; Delete (bad) extra quotes from comma separated list.
2323 ((match-beginning 1)
2324 (while (progn (skip-chars-forward "^\"") (not (eobp)))
2326 (goto-char (point-min)))
2327 ((match-beginning 3)
2328 (while (progn (skip-chars-forward "^'") (not (eobp)))
2330 (goto-char (point-min))))
2332 ;; In attribute value literals, EE and RS are ignored
2333 ;; and RE and SEPCHAR characters sequences are
2334 ;; replaced by SPACEs.
2336 ;; (There is no way right now to get RS into one of
2337 ;; these so that it can be ignored. This is due to
2338 ;; our using Unix line-handling conventions.)
2339 (skip-chars-forward "^&\t\n\r")
2342 ;; We must expand entities and replace RS, RE,
2344 (goto-char (point-min))
2346 (skip-chars-forward "^&")
2348 (w3-expand-entity-at-point-maybe))
2349 (subst-char-in-region (point-min) (point-max) ?\t ? )
2350 (subst-char-in-region (point-min) (point-max) ?\n ? ))
2351 ;; Set this after we have changed the size of the
2353 (setq attribute-value-end (1+ (point-max))))
2354 ((match-beginning 8)
2355 (setq attribute-value-end (match-end 8))
2356 (narrow-to-region (point) attribute-value-end))
2357 ((match-beginning 7)
2358 (setq attribute-value-end (match-end 7))
2359 (narrow-to-region (point) attribute-value-end)
2360 ;; Horribly illegal non-SGML handling of bad
2361 ;; HTML on the net. This can break valid HTML.
2362 (setq attr-value (buffer-substring (point)
2364 (w3-debug-html :nocontext
2365 (format "Evil attribute value syntax: %s"
2366 (buffer-substring (point-min) (point-max)))))
2368 (error "impossible attribute value"))))
2369 ((memq (char-after (point)) '(?\" ?'))
2370 ;; Missing terminating quote character.
2371 (narrow-to-region (point)
2374 (skip-chars-forward "^ \t\n\r'\"<>")
2375 (setq attribute-value-end (point))))
2376 (w3-debug-html :nocontext
2377 (format "Attribute value missing end quote: %s"
2378 (buffer-substring (point-min) (point-max))))
2379 (narrow-to-region (1+ (point-min)) (point-max)))
2381 ;; We have a syntactically invalid attribute value. Let's
2382 ;; make a best guess as to what the author intended.
2383 (narrow-to-region (point)
2385 (skip-chars-forward "^ \t\n\r'\"<>")
2386 (setq attribute-value-end (point))))
2387 (w3-debug-html :nocontext
2388 (format "Bad attribute value syntax: %s"
2389 (buffer-substring (point-min) (point-max))))))
2390 ;; Now we have isolated the attribute value. We need to
2391 ;; munge the value depending on the syntax of the
2393 ;; *** Right now, we only implement the necessary munging
2394 ;; for CDATA attributes, which is none. I'm not sure why
2395 ;; this happens to work for other attributes right now.
2396 ;; For any other kind of attribute, we are supposed to
2398 ;; * remove leading/trailing whitespace
2399 ;; * smash multiple space sequences into single spaces
2400 ;; * verify the syntax of each token
2401 (setq attr-value (buffer-substring (point-min) (point-max)))
2404 (setq attr-value (split-string attr-value "[ ,]+")))
2406 (if (string-match "^[ \t\r\n]*\\(.*\\)[ \t\r\n]*$"
2408 (setq attr-value (downcase
2409 (substring attr-value
2412 (setq attr-value (downcase attr-value)))
2413 (setq attr-value (intern attr-value)))
2415 ;; I should expand URLs here
2420 (goto-char attribute-value-end))
2422 ;; No value was specified, in which case NAME should be
2423 ;; taken as ATTRIBUTE=NAME where NAME is one of the
2424 ;; enumerated values for ATTRIBUTE.
2425 ;; We assume here that ATTRIBUTE is the same as NAME.
2426 ;; *** Another piece of code will fix the attribute name if it
2428 (setq attr-value (symbol-name attr-name))))
2430 ;; Accumulate the attributes.
2431 (setq tag-attributes (cons (cons attr-name attr-value)
2434 (if (and (eq w3-p-d-tag-name 'img)
2435 (not (assq 'alt tag-attributes)))
2436 (w3-debug-html :bad-style
2438 "IMG element has no ALT attribute"))
2440 ((and (eq w3-p-d-tag-name 'base)
2441 (setq w3-p-s-baseobject
2442 (or (assq 'src tag-attributes)
2443 (assq 'href tag-attributes))))
2444 (setq w3-p-s-baseobject (url-generic-parse-url
2445 (cdr w3-p-s-baseobject))))
2446 ((setq w3-p-s-btdt (or (assq 'src tag-attributes)
2447 (assq 'background tag-attributes)
2448 (assq 'codebase tag-attributes)
2449 (assq 'href tag-attributes)
2450 (assq 'action tag-attributes)))
2451 (setcdr w3-p-s-btdt (url-expand-file-name (cdr w3-p-s-btdt)
2453 (setq w3-p-s-btdt (if (url-have-visited-url (cdr w3-p-s-btdt))
2456 (if (assq 'class tag-attributes)
2457 (setcdr (assq 'class tag-attributes)
2459 (cdr (assq 'class tag-attributes))))
2460 (setq tag-attributes (cons (cons 'class (list w3-p-s-btdt))
2463 (if (not (eq w3-p-d-tag-name 'input))
2465 (setq w3-p-s-btdt (concat ":"
2468 (assq 'type tag-attributes))
2470 (if (assq 'class tag-attributes)
2471 (setcdr (assq 'class tag-attributes)
2473 (cdr (assq 'class tag-attributes))))
2474 (setq tag-attributes (cons (cons 'class (list w3-p-s-btdt))
2478 ;; Process the end of the tag.
2479 (skip-chars-forward " \t\n\r")
2480 (cond ((eq ?> (char-after (point)))
2481 ;; Ordinary tag end.
2483 ;; jbw 2001-06-25: added next sexp to make XHTML
2484 ;; masquerading as HTML work. This is a crude
2485 ;; disgusting hack which happens to make many of
2486 ;; the common cases work. One thing it does not
2487 ;; handle is if the input contains <br></br> which
2488 ;; is legal XHTML. Probably to handle that we need
2489 ;; to set a flag if we see an XML declaration and
2490 ;; then treat the EMPTY content model differently
2494 (or ;; XHTML-style empty tag
2495 (let ((html-element-info (get w3-p-d-tag-name 'html-element-info)))
2496 (and html-element-info
2498 (w3-element-content-model
2499 html-element-info))))
2500 ;; XHTML empty element which is not ordinarily
2501 ;; empty. Simulate by inserting an end tag.
2503 (insert "</" (symbol-name w3-p-d-tag-name) ">"))))
2504 ((and (eq ?/ (char-after (point)))
2505 (not w3-p-d-end-tag-p))
2506 ;; This is a NET-enabling start-tag.
2509 ((eq ?< (char-after (point)))
2510 ;; *** Strictly speaking, the following text has to
2511 ;; lexically be STAGO or ETAGO, which means that it
2512 ;; can't match some other lexical unit.
2518 (format "Bad unclosed %s%s tag"
2519 (if w3-p-d-end-tag-p "/" "")
2520 (w3-sgml-name-to-string w3-p-d-tag-name)))))
2522 (setq tag-end (point)))
2525 ;; We are looking at an empty tag (<>, </>).
2526 (setq w3-p-d-end-tag-p (eq ?/ (char-after (point))))
2527 (setq w3-p-d-tag-name (if w3-p-d-end-tag-p
2528 (w3-element-name w3-p-d-current-element)
2529 ;; *** Strictly speaking, if OMITTAG NO, then
2530 ;; we should use the most recently closed tag.
2531 ;; But OMITTAG YES in HTML and I'm lazy.
2532 (w3-element-name w3-p-d-current-element)))
2533 (setq tag-attributes nil)
2534 ;; *** Make sure this is not at top level.
2535 (setq between-tags-end (1- (point)))
2536 (setq tag-end (match-end 0)))
2538 ;; *** In SGML, <(doctype)element> is valid tag syntax. This
2539 ;; cannot occur in HTML because the CONCUR option is off in the
2540 ;; SGML declaration.
2543 ;; We found a comment, delete to end of comment.
2548 ;; Skip over pairs of -- ... --.
2550 ;; This can cause us to hit a stack overflow in the regexp
2551 ;; engine. And I'm not sure its correct anyway. Lets just
2552 ;; always fall back to the (semi) non-SGML way of dealing
2553 ;; with comments. WMP 12/24/97
2554 ;;; (if (looking-at "\\(--[^-]*\\(-[^-]+\\)*--[ \t\r\n]*\\)+>")
2555 ;;; (goto-char (match-end 0))
2556 ;;; ;; Syntax error!
2558 ;;; "Bad comment (unterminated or unbalanced \"--\" pairs)")
2559 ;;; (forward-char 2)
2560 ;;; (or (re-search-forward "--[ \t\r\n]*>" nil t)
2561 ;;; (search-forward ">" nil t)))
2563 (or (re-search-forward "--[ \t\r\n]*>" nil t)
2564 (search-forward ">" nil t))
2567 ((looking-at "!>\\|\\?[^>]*>")
2568 ;; We are looking at an empty comment or a processing
2569 ;; instruction. Delete it.
2573 ((looking-at "![a-z]")
2574 ;; We are looking at a markup declaration. Delete it.
2575 ;; *** Technically speaking, to handle valid HTML I think we
2576 ;; need to handle "<!USEMAP ... >" declarations. In the future,
2577 ;; to handle general SGML, we should parse "<!DOCTYPE ... >"
2578 ;; declarations as well (which can contain other declarations).
2579 ;; In the very distant future, perhaps we will handle "<!SGML
2580 ;; ... >" declarations.
2581 ;; *** Should warn if it's not SGML, DOCTYPE, or USEMAP.
2589 ;; *** This might not actually be bad syntax, but might
2590 ;; instead be a -- ... -- comment with unbalanced
2591 ;; parentheses somewhere inside the declaration. Handling
2592 ;; this properly would require full parsing of markup
2593 ;; declarations, a goal for the future.
2594 (w3-debug-html "Bad <! syntax.")
2595 (skip-chars-forward "^>")
2596 (if (eq ?> (char-after (point)))
2600 ((looking-at "!\\\[\\(\\([ \t\n\r]*[a-z]+\\)+[ \t\n\r]*\\)\\\[")
2601 ;; We are looking at a marked section.
2602 ;; *** Strictly speaking, we should issue a warning if the
2603 ;; keywords are invalid or missing or if the "[" does not follow.
2604 ;; We must look at the keywords to understand how to parse it.
2605 ;; *** Strictly speaking, we should perform parameter entity
2606 ;; substitution on the keywords first.
2607 (goto-char (match-beginning 1))
2609 (goto-char (1- (match-beginning 0)))
2613 (let* ((keywords (read (current-buffer)))
2614 ;; Multiple keywords may appear, but only the most
2615 ;; significant takes effect. Rank order is IGNORE, CDATA,
2616 ;; RCDATA, INCLUDE, and TEMP. INCLUDE and TEMP have the
2618 (keyword (car-safe (cond ((memq 'IGNORE keywords))
2619 ((memq 'CDATA keywords))
2620 ((memq 'RCDATA keywords))
2621 ((memq 'INCLUDE keywords))
2622 ((memq 'TEMP keywords))))))
2623 (or (eq ?\[ (char-after (point)))
2624 ;; I probably shouldn't even check this, since it is so
2626 (error "impossible ??"))
2628 (delete-region (1- (match-beginning 0)) (point))
2629 (cond ((eq 'IGNORE keyword)
2630 ;; Scan forward skipping over matching <![ ... ]]>
2631 ;; until we find an unmatched "]]>".
2632 (let ((ignore-nesting 1)
2633 (start-pos (point)))
2634 (while (> ignore-nesting 0)
2635 (if (re-search-forward "<!\\\\\[\\|\]\]>" nil t)
2636 (setq ignore-nesting
2637 (if (eq ?> (preceding-char))
2639 (1+ ignore-nesting)))
2641 "Unterminated IGNORE marked section.")
2642 (setq ignore-nesting 0)
2643 (goto-char start-pos)))
2644 (delete-region start-pos (point))))
2645 ((eq 'CDATA keyword)
2646 (error "***unimplemented***"))
2647 ((eq 'RCDATA keyword)
2648 (error "***unimplemented***"))
2649 ((memq keyword '(INCLUDE TEMP))
2650 (error "***unimplemented***")))))
2651 ((and (looking-at "!")
2652 w3-netscape-compatible-comments)
2653 ;; Horribly illegal non-SGML handling of bad HTML on the net.
2654 ;; This can break valid HTML.
2655 ;; This arises because Netscape discards anything looking like
2656 ;; "<!...>". So people expect they can use this construct as
2658 (w3-debug-html "Evil <! comment syntax.")
2663 (skip-chars-forward "^>")
2664 (if (eq ?> (char-after (point)))
2668 ;; This < is not a markup character. Pretend we didn't notice
2669 ;; it at all. We have skipped over the < already, so just loop
2673 ((eq ?& (char-after (point)))
2674 (w3-expand-entity-at-point-maybe))
2676 ((and (eq ?\] (char-after (point)))
2677 w3-p-d-in-parsed-marked-section
2679 ;; *** handle the end of a parsed marked section.
2680 (error "***unimplemented***"))
2682 ((and (eq ?/ (char-after (point)))
2683 w3-p-d-null-end-tag-enabled)
2684 ;; We are looking at a null end tag.
2685 (setq w3-p-d-end-tag-p t)
2686 (setq between-tags-end (point))
2687 (setq tag-end (1+ (point)))
2688 (setq w3-p-d-tag-name (w3-element-name w3-p-d-current-element)))
2690 ;; This can be slow, since we'll hardly ever get here.
2691 ;; *** Strictly speaking, I think we're supposed to handle
2692 ;; shortrefs that begin with the same characters as other markup,
2693 ;; preferring the longest match.
2694 ;; I will assume that shortrefs never begin with <, &, \], /.
2695 ((setq ref (catch 'found-shortref
2696 (let ((refs w3-p-d-shortrefs))
2698 (if (looking-at (car (car refs)))
2699 (throw 'found-shortref (cdr (car refs))))
2700 (setq refs (cdr refs))))))
2701 ;; We are looking at a shortref for which there is an
2702 ;; expansion defined in the current syntax. Replace with the
2703 ;; expansion, leaving point at the beginning so it will be parsed
2704 ;; on the next loop.
2705 ;; *** eek. This is wrong if the shortref is for an entity with
2706 ;; CDATA syntax which should not be reparsed for tags.
2712 ((looking-at (eval-when-compile
2713 (concat "[" (w3-invalid-sgml-chars) "]")))
2715 (format "Invalid SGML character: %c" (char-after (point))))
2716 ;; Probably cp1252 or some such without proper MIME spec...
2717 (insert (w3-resolve-numeric-char
2718 (w3-char-int (char-after (point)))))
2721 ;; We have finished the buffer. Make sure we process the last
2722 ;; piece of text, if any.
2723 (setq between-tags-end (point))
2724 ;; We have to test what's on the element stack because this
2725 ;; piece of code gets executed twice.
2726 (cond ((not (eq '*holder (w3-element-name w3-p-d-current-element)))
2727 ;; This forces the calculation of implied omitted end tags.
2728 (setq w3-p-d-tag-name '*document)
2729 (setq w3-p-d-end-tag-p t)
2730 (setq tag-end (point)))))
2733 (error "unreachable code, this can't happen")))
2735 ;; If we have determined the boundaries of a non-empty between-tags
2736 ;; region of text, then handle it.
2740 ((< between-tags-start between-tags-end)
2741 ;; We have a non-empty between-tags region.
2743 ;; We check if it's entirely whitespace, because we record the
2744 ;; transitions for whitespace separately from those for
2745 ;; data with non-whitespace characters.
2746 (goto-char between-tags-start)
2747 (skip-chars-forward " \t\n\r" between-tags-end)
2749 ((w3-grok-tag-or-data (prog1
2750 (if (= between-tags-end (point))
2753 (goto-char between-tags-end)))
2754 ;; We have to include the text in the current element's
2755 ;; contents. If this is the first item in the current
2756 ;; element's contents, don't include a leading newline if
2757 ;; there is one. Add a trailing newline as a separate text
2758 ;; item so that it can be removed later if it turns out to
2759 ;; be the last item in the current element's contents when
2760 ;; the current element is closed.
2761 ;; *** We could perform this test before calling
2762 ;; w3-grok-tag-or-data, but it's not clear which will be
2763 ;; faster in practice.
2764 (or (setq content (w3-element-content w3-p-d-current-element))
2765 ;; *** Strictly speaking, in SGML the record end is
2766 ;; carriage return, not line feed.
2767 (if (eq ?\n (char-after between-tags-start))
2768 (setq between-tags-start (1+ between-tags-start))))
2769 (if (= between-tags-start (point))
2772 ;; We are definitely going to add data characters to the
2775 ((and (= ?\n (preceding-char))
2776 (/= between-tags-start (1- (point))))
2777 (setq content (cons (buffer-substring between-tags-start
2780 (setq content (cons "\n" content)))
2782 (setq content (cons (buffer-substring between-tags-start
2785 (w3-set-element-content w3-p-d-current-element content))))))
2787 (setq between-tags-end nil)))
2789 ;; If the previous expression modified (point), then it went to
2790 ;; the value of between-tags-end.
2792 ;; If we found a start or end-tag, we need to handle it.
2796 ;; Move past the tag and prepare for next between-tags region.
2798 (setq between-tags-start (point))
2802 ;; Handle an end-tag.
2803 (if (eq w3-p-d-tag-name (w3-element-name w3-p-d-current-element))
2805 ;; Handle the complex version. We have to search up (down?)
2806 ;; the open element stack to find the element that matches (if
2807 ;; any). Then we close all of the elements. On a conforming
2808 ;; SGML document this can do no wrong and it's not
2809 ;; unreasonable on a non-conforming document.
2811 ;; Can't safely modify stack until we know the element we want
2812 ;; to find is in there, so work with a copy.
2813 (setq open-list w3-p-d-open-element-stack)
2814 (while (and open-list
2815 (not (eq w3-p-d-tag-name
2816 (w3-element-name (car open-list)))))
2817 (setq open-list (cdr open-list)))
2819 ;; We found a match. Pop elements.
2820 ;; We will use the following value as a sentinel.
2821 (setq open-list (cdr open-list))
2822 (while (not (eq open-list w3-p-d-open-element-stack))
2823 (w3-close-element t))
2828 (format "Unmatched end-tag </%s>"
2829 (w3-sgml-name-to-string w3-p-d-tag-name)))))))
2831 ;; Handle a start-tag.
2833 ;; Check if the new element is allowed in the current element's
2835 ((w3-grok-tag-or-data w3-p-d-tag-name)
2836 (w3-open-element w3-p-d-tag-name tag-attributes)
2838 ;; Handle NET-enabling start tags.
2839 (cond ((and net-tag-p
2840 (not w3-p-d-null-end-tag-enabled))
2842 (w3-set-element-undo-list
2843 w3-p-d-current-element
2844 (cons (cons 'w3-p-d-non-markup-chars
2845 w3-p-d-non-markup-chars)
2846 (cons '(w3-p-d-null-end-tag-enabled . nil)
2847 (w3-element-undo-list w3-p-d-current-element))))
2849 (setq w3-p-d-null-end-tag-enabled t)
2850 (w3-update-non-markup-chars)))
2853 (w3-element-content-model w3-p-d-current-element))
2855 ;; If the element does not have parsed contents, then we
2856 ;; can find its contents immediately.
2858 ((memq content-model '(EMPTY CDATA XCDATA XXCDATA RCDATA))
2860 ((eq 'EMPTY content-model)
2862 ((eq 'CDATA content-model)
2863 ;; CDATA: all data characters until an end-tag. We'll
2864 ;; process the end-tag on the next loop.
2865 (if (re-search-forward (if w3-p-d-null-end-tag-enabled
2869 (goto-char (match-beginning 0))))
2870 ((eq 'XCDATA content-model)
2871 ;; XCDATA: special non-SGML-standard mode which includes
2872 ;; all data characters until "</foo" is seen where "foo"
2873 ;; is the name of this element (for XMP and LISTING).
2875 (concat "</" (symbol-name
2876 (w3-element-name w3-p-d-current-element)))
2878 (goto-char (match-beginning 0))))
2879 ((eq 'XXCDATA content-model)
2880 ;; XXCDATA: special non-SGML-standard mode which includes
2881 ;; all data until end-of-entity (end-of-buffer for us)
2883 (goto-char (point-max)))
2884 ((eq 'RCDATA content-model)
2885 ;; RCDATA: all data characters until end-tag is seen,
2886 ;; except that entities are expanded first, although the
2887 ;; expansions are _not_ scanned for end-tags, although the
2888 ;; expansions _are_ scanned for further entity
2891 (if (re-search-forward (if w3-p-d-null-end-tag-enabled
2895 (goto-char (match-beginning 0)))
2896 (eq ?& (char-after (point))))
2897 (w3-expand-entity-at-point-maybe)))))))
2899 ;; The element is illegal here. We'll just discard the start
2900 ;; tag as though we never saw it.
2903 (setq w3-p-d-tag-name nil)
2904 (setq w3-p-d-end-tag-p nil)
2905 (setq net-tag-p nil)
2906 (setq tag-attributes nil)
2907 (setq tag-end nil)))
2909 ;; End of main while loop.
2912 ;; We have finished parsing the buffer!
2913 (if status-message-format
2914 (message "%sdone" (format status-message-format 100)))
2916 ;; *** For debugging, save the true parse tree.
2917 ;; *** Make this look inside *DOCUMENT.
2918 (setq w3-last-parse-tree
2919 (w3-element-content w3-p-d-current-element))
2921 (set-syntax-table old-syntax-table)
2922 (w3-element-content w3-p-d-current-element)
2925 (require 'w3-fast-parse)
2927 (defun w3-parse-buffer (&optional buff)
2928 "Parse contents of BUFF as HTML.
2929 BUFF defaults to the current buffer.
2930 Destructively alters contents of BUFF.
2931 Returns a data structure containing the parsed information."
2932 (if nil ;; (w3-fast-parse-find-tidy-program)
2933 (fset 'w3-parse-buffer 'w3-fast-parse-buffer)
2934 (fset 'w3-parse-buffer 'w3-slow-parse-buffer))
2935 (w3-parse-buffer buff))
2942 ;; indent-tabs-mode: nil