;;; html2text.el --- a simple html to plain text converter
-;; Copyright (C) 2002 Free Software Foundation, Inc.
+;; Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
;; Author: Joakim Hove <hove@phys.ntnu.no>
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING. If not, write to the
-;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-;; Boston, MA 02111-1307, USA.
+;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
;;; Commentary:
;; These functions provide a simple way to wash/clean html infected
;; mails. They definitely do not work in all cases, but some improvement
-;; in readability is generally obtained. Formatting is only done in
+;; in readability is generally obtained. Formatting is only done in
;; the buffer, so the next time you enter the article it will be
;; "re-htmlized".
;;
-;; The main function is "html2text"
+;; The main function is `html2text'.
;;; Code:
;; Alist with a single entry pairing the tag name "hr" with the symbol
;; `html2text-clean-hr'.  NOTE(review): presumably consulted for tags
;; that have no closing counterpart -- confirm against the code that
;; reads this variable.
(defvar html2text-format-single-element-list '(("hr" . html2text-clean-hr)))
(defvar html2text-replace-list
- '(("&nbsp;" . " ") ("&gt;" . ">") ("&lt;" . "<") ("&quot;" . "\""))
+ '(("&acute;" . "`")
+ ("&amp;" . "&")
+ ("&apos;" . "'")
+ ("&brvbar;" . "|")
+ ("&cent;" . "c")
+ ("&circ;" . "^")
+ ("&copy;" . "(C)")
+ ("&curren;" . "(#)")
+ ("&deg;" . "degree")
+ ("&divide;" . "/")
+ ("&euro;" . "e")
+ ("&frac12;" . "1/2")
+ ("&gt;" . ">")
+ ("&iquest;" . "?")
+ ("&laquo;" . "<<")
+ ;; The terminated form comes first so that no stray ";" is left
+ ;; behind; the bare form is kept for input that omits the ";".
+ ("&ldquo;" . "\"")
+ ("&ldquo" . "\"")
+ ("&lsaquo;" . "(")
+ ("&lsquo;" . "`")
+ ("&lt;" . "<")
+ ("&mdash;" . "--")
+ ("&nbsp;" . " ")
+ ("&ndash;" . "-")
+ ("&permil;" . "%%")
+ ("&plusmn;" . "+-")
+ ("&pound;" . "£")
+ ("&quot;" . "\"")
+ ("&raquo;" . ">>")
+ ;; Same ordering as for "&ldquo;" above.
+ ("&rdquo;" . "\"")
+ ("&rdquo" . "\"")
+ ("&reg;" . "(R)")
+ ("&rsaquo;" . ")")
+ ("&rsquo;" . "'")
+ ("&sect;" . "§")
+ ("&sup1;" . "^1")
+ ("&sup2;" . "^2")
+ ("&sup3;" . "^3")
+ ("&tilde;" . "~"))
"The map of entity to text.
This is an alist where each element is a dotted pair consisting of an
-old string, and a replacement string. This replacement is done by the
-function \"html2text-substitute\" which basically performs a
-replace-string operation for every element in the list. This is
+old string, and a replacement string. This replacement is done by the
+function `html2text-substitute' which basically performs a
+`replace-string' operation for every element in the list. This is
completely verbatim - without any use of REGEXP.")
(defvar html2text-remove-tag-list
"A list of removable tags.
This is a list of tags which should be removed, without any
-formatting. Observe that if you the tags in the list are presented
-*without* any \"<\" or \">\". All occurences of a tag appearing in
-this list are removed, irrespective of whether it is a closing or
-opening tag, or if the tag has additional attributes. The actual
-deletion is done by the function \"html2text-remove-tags\".
+formatting. Note that tags in the list are presented *without*
+any \"<\" or \">\". All occurrences of a tag appearing in this
+list are removed, irrespective of whether it is a closing or
+opening tag, or if the tag has additional attributes. The
+deletion is done by the function `html2text-remove-tags'.
For instance the text:
(defvar html2text-format-tag-list
'(("b" . html2text-clean-bold)
+ ("strong" . html2text-clean-bold)
("u" . html2text-clean-underline)
("i" . html2text-clean-italic)
+ ("em" . html2text-clean-italic)
("blockquote" . html2text-clean-blockquote)
("a" . html2text-clean-anchor)
("ul" . html2text-clean-ul)
"An alist of tags and processing functions.
This is an alist where each dotted pair consists of a tag, and then
-the name of a function to be called when this tag is found. The
+the name of a function to be called when this tag is found. The
function is called with the arguments p1, p2, p3 and p4. These are
demonstrated below:
;; <Utility functions>
;;
-(defun html2text-buffer-head ()
- (if (string= mode-name "Article")
- (beginning-of-buffer)
- (beginning-of-buffer)
- )
- )
-(defun html2text-replace-string (from-string to-string p1 p2)
- (goto-char p1)
+(defun html2text-replace-string (from-string to-string min max)
+ "Replace FROM-STRING with TO-STRING in region from MIN to MAX.
+Every occurrence found by `search-forward' between MIN and MAX is
+replaced with TO-STRING, inserted literally.  Return the summed
+difference in `string-width' between TO-STRING and FROM-STRING over
+all replacements made (0 when nothing matched)."
+ (goto-char min)
(let ((delta (- (string-width to-string) (string-width from-string)))
- (change 0))
- (while (search-forward from-string p2 t)
+ ;; Track the region end with a marker: each replacement moves the
+ ;; real end, so a fixed MAX would let the search overrun the
+ ;; region or stop short of it.
+ (end (copy-marker max))
+ (change 0))
+ (while (search-forward from-string end t)
- (replace-match to-string)
- (setq change (+ change delta))
- )
- change
- )
- )
+ ;; LITERAL is t so a backslash in TO-STRING is inserted as-is.
+ (replace-match to-string nil t)
+ (setq change (+ change delta)))
+ ;; Detach the marker so it no longer has to be updated on edits.
+ (set-marker end nil)
+ change))
;;
;; </Utility functions>
;; <Functions related to attributes> i.e. <font size=+3>
;;
-(defun html2text-attr-value (attr-list attr)
- (nth 1 (assoc attr attr-list))
- )
+(defun html2text-attr-value (list attribute)
+ "Return the second element of the LIST entry whose car is ATTRIBUTE.
+The entry is located with `assoc'; the result is nil when LIST has no
+entry for ATTRIBUTE."
+ (cadr (assoc attribute list)))
-(defun html2text-get-attr (p1 p2 tag)
+(defun html2text-get-attr (p1 p2)
(goto-char p1)
(re-search-forward " +[^ ]" p2 t)
(let* ((attr-string (buffer-substring-no-properties (1- (point)) (1- p2)))
((string-match "[^ ]=[^ ]" prev)
(let ((attr (nth 0 (split-string prev "=")))
(value (nth 1 (split-string p