X-Git-Url: http://cgit.sxemacs.org/?a=blobdiff_plain;f=lisp%2Fhtml2text.el;h=27dc599c9b1d8ceb3fcf7169bebbb465ff3a4d02;hb=c40056695d7dcc4e7c71707009bc077a18a3344d;hp=f33b1257369b21b8e1b86e1d12c5623013cd0dcf;hpb=96d4a7cc3dda58f46de9f03c18f91f8a59e4a8e5;p=gnus diff --git a/lisp/html2text.el b/lisp/html2text.el index f33b12573..27dc599c9 100644 --- a/lisp/html2text.el +++ b/lisp/html2text.el @@ -1,5 +1,6 @@ ;;; html2text.el --- a simple html to plain text converter -;; Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc. + +;; Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc. ;; Author: Joakim Hove @@ -17,8 +18,8 @@ ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs; see the file COPYING. If not, write to the -;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, -;; Boston, MA 02111-1307, USA. +;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +;; Boston, MA 02110-1301, USA. ;;; Commentary: @@ -42,8 +43,42 @@ (defvar html2text-format-single-element-list '(("hr" . html2text-clean-hr))) (defvar html2text-replace-list - '((" " . " ") (">" . ">") ("<" . "<") (""" . "\"") - ("&" . "&") ("'" . "'")) + '(("´" . "`") + ("&" . "&") + ("'" . "'") + ("¦" . "|") + ("¢" . "c") + ("ˆ" . "^") + ("©" . "(C)") + ("¤" . "(#)") + ("°" . "degree") + ("÷" . "/") + ("€" . "e") + ("½" . "1/2") + (">" . ">") + ("¿" . "?") + ("«" . "<<") + ("&ldquo" . "\"") + ("‹" . "(") + ("‘" . "`") + ("<" . "<") + ("—" . "--") + (" " . " ") + ("–" . "-") + ("‰" . "%%") + ("±" . "+-") + ("£" . "£") + (""" . "\"") + ("»" . ">>") + ("&rdquo" . "\"") + ("®" . "(R)") + ("›" . ")") + ("’" . "'") + ("§" . "§") + ("¹" . "^1") + ("²" . "^2") + ("³" . "^3") + ("˜" . "~")) "The map of entity to text. This is an alist were each element is a dotted pair consisting of an @@ -144,7 +179,7 @@ formatting, and then moved afterward.") "Get value of ATTRIBUTE from LIST." (nth 1 (assoc attribute list))) -(defun html2text-get-attr (p1 p2 tag) +(defun html2text-get-attr (p1 p2) (goto-char p1) (re-search-forward " +[^ ]" p2 t) (let* ((attr-string (buffer-substring-no-properties (1- (point)) (1- p2))) @@ -214,24 +249,6 @@ formatting, and then moved afterward.") ;; return - value attr-list)) -(defun html2text-get-attr (p1 p2) - (save-restriction - (narrow-to-region p1 p2) - (let (result) - (goto-char (point-min)) - (while (not (eobp)) - (when (re-search-forward "[^= ]+" nil t) - (push - (list - (match-string 0) - (when (looking-at " *= *") - (goto-char (match-end 0)) - (buffer-substring - (point) - (goto-char (or (ignore-errors (scan-sexps (point) 1)) - (point-max)))))) - result))) - result))) ;; ;; ;; @@ -338,7 +355,7 @@ formatting, and then moved afterward.") ;; If someone can explain how to make the URL clickable I will surely ;; improve upon this. ;; Maybe `goto-addr.el' can be used here. - (let* ((attr-list (html2text-get-attr p1 p2 "a")) + (let* ((attr-list (html2text-get-attr p1 p2)) (href (html2text-attr-value attr-list "href"))) (delete-region p1 p4) (when href @@ -391,7 +408,8 @@ formatting, and then moved afterward.") fashion, quite close to pure guess-work. It does work in some cases though." (interactive) (goto-char (point-min)) - (replace-regexp "^
$" "") + (while (re-search-forward "^
$" nil t) + (delete-region (match-beginning 0) (match-end 0))) ;; Removing lonely
on a single line, if they are left intact we ;; dont have any paragraphs at all. (goto-char (point-min)) @@ -435,8 +453,7 @@ See the documentation for that variable." (point-max) t) (let ((p1) (p2 (point)) - (p3) (p4) - (attr (match-string 0))) + (p3) (p4)) (search-backward "<" (point-min) t) (setq p1 (point)) (search-forward (format "" tag) (point-max) t)