1 ;;; html2text.el --- a simple html to plain text converter
3 ;; Copyright (C) 2002-2011 Free Software Foundation, Inc.
5 ;; Author: Joakim Hove <hove@phys.ntnu.no>
7 ;; This file is part of GNU Emacs.
9 ;; GNU Emacs is free software: you can redistribute it and/or modify
10 ;; it under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation, either version 3 of the License, or
12 ;; (at your option) any later version.
14 ;; GNU Emacs is distributed in the hope that it will be useful,
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details.
19 ;; You should have received a copy of the GNU General Public License
20 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
24 ;; These functions provide a simple way to wash/clean html infected
25 ;; mails. Definitely do not work in all cases, but some improvement
26 ;; in readability is generally obtained. Formatting is only done in
27 ;; the buffer, so the next time you enter the article it will be
30 ;; The main function is `html2text'.
41 (defvar html2text-format-single-element-list '(("hr" . html2text-clean-hr)))
43 (defvar html2text-replace-list
80 "The map of entity to text.
82 This is an alist were each element is a dotted pair consisting of an
83 old string, and a replacement string. This replacement is done by the
84 function `html2text-substitute' which basically performs a
85 `replace-string' operation for every element in the list. This is
86 completely verbatim - without any use of REGEXP.")
88 (defvar html2text-remove-tag-list
89 '("html" "body" "p" "img" "dir" "head" "div" "br" "font" "title" "meta")
90 "A list of removable tags.
92 This is a list of tags which should be removed, without any
93 formatting. Note that tags in the list are presented *without*
94 any \"<\" or \">\". All occurrences of a tag appearing in this
95 list are removed, irrespective of whether it is a closing or
96 opening tag, or if the tag has additional attributes. The
97 deletion is done by the function `html2text-remove-tags'.
99 For instance the text:
101 \"Here comes something <font size\"+3\" face=\"Helvetica\"> big </font>.\"
105 \"Here comes something big.\"
107 If this list contains the element \"font\".")
109 (defvar html2text-format-tag-list
110 '(("b" . html2text-clean-bold)
111 ("strong" . html2text-clean-bold)
112 ("u" . html2text-clean-underline)
113 ("i" . html2text-clean-italic)
114 ("em" . html2text-clean-italic)
115 ("blockquote" . html2text-clean-blockquote)
116 ("a" . html2text-clean-anchor)
117 ("ul" . html2text-clean-ul)
118 ("ol" . html2text-clean-ol)
119 ("dl" . html2text-clean-dl)
120 ("center" . html2text-clean-center))
121 "An alist of tags and processing functions.
123 This is an alist where each dotted pair consists of a tag, and then
124 the name of a function to be called when this tag is found. The
125 function is called with the arguments p1, p2, p3 and p4. These are
128 \"<b> This is bold text </b>\"
133 Then the called function will typically format the text somewhat and
136 (defvar html2text-remove-tag-list2 '("li" "dt" "dd" "meta")
137 "Another list of removable tags.
139 This is a list of tags which are removed similarly to the list
140 `html2text-remove-tag-list' - but these tags are retained for the
141 formatting, and then moved afterward.")
144 ;; </Global variables>
147 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
148 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
151 ;; <Utility functions>
155 (defun html2text-replace-string (from-string to-string min max)
156 "Replace FROM-STRING with TO-STRING in region from MIN to MAX."
158 (let ((delta (- (string-width to-string) (string-width from-string)))
160 (while (search-forward from-string max t)
161 (replace-match to-string)
162 (setq change (+ change delta)))
166 ;; </Utility functions>
169 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
170 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
173 ;; <Functions related to attributes> i.e. <font size=+3>
176 (defun html2text-attr-value (list attribute)
177 "Get value of ATTRIBUTE from LIST."
178 (nth 1 (assoc attribute list)))
180 (defun html2text-get-attr (p1 p2)
182 (re-search-forward " +[^ ]" p2 t)
183 (let* ((attr-string (buffer-substring-no-properties (1- (point)) (1- p2)))
184 (tmp-list (split-string attr-string))
187 (prev (car tmp-list))
188 (this (nth 1 tmp-list))
189 (next (nth 2 tmp-list))
194 ((string-match "[^ ]=[^ ]" prev)
195 (let ((attr (nth 0 (split-string prev "=")))
196 (value (nth 1 (split-string prev "="))))
197 (setq attr-list (cons (list attr value) attr-list))))
199 ((string-match "[^ ]=\\'" prev)
200 (setq attr-list (cons (list (substring prev 0 -1) this) attr-list))))
202 (while (< index (length tmp-list))
205 ((string-match "[^ ]=[^ ]" this)
206 (let ((attr (nth 0 (split-string this "=")))
207 (value (nth 1 (split-string this "="))))
208 (setq attr-list (cons (list attr value) attr-list))))
210 ((string-match "\\`=[^ ]" this)
211 (setq attr-list (cons (list prev (substring this 1)) attr-list)))
213 ((string-match "[^ ]=\\'" this)
214 (setq attr-list (cons (list (substring this 0 -1) next) attr-list)))
217 (setq attr-list (cons (list prev next) attr-list))))
218 (setq index (1+ index))
221 (setq next (nth (1+ index) tmp-list)))
223 ;; Tags with no accompanying "=" i.e. value=nil
225 (setq prev (car tmp-list))
226 (setq this (nth 1 tmp-list))
227 (setq next (nth 2 tmp-list))
230 (when (and (not (string-match "=" prev))
231 (not (string= (substring this 0 1) "=")))
232 (setq attr-list (cons (list prev nil) attr-list)))
233 (while (< index (1- (length tmp-list)))
234 (when (and (not (string-match "=" this))
235 (not (or (string= (substring next 0 1) "=")
236 (string= (substring prev -1) "="))))
237 (setq attr-list (cons (list this nil) attr-list)))
238 (setq index (1+ index))
241 (setq next (nth (1+ index) tmp-list)))