X-Git-Url: http://cgit.sxemacs.org/?a=blobdiff_plain;f=lisp%2Fhtml2text.el;h=47c2e1cae69b35b2e295bf7fd667482fe86bbbd9;hb=ee19f3e2ca78414ec186ece12d3580f7a7af206d;hp=f76cb305634eb81b01545b1c8140e61a7170c15c;hpb=e5b801c287295e0df6ecde6c91ae715ef8bd4f0a;p=gnus diff --git a/lisp/html2text.el b/lisp/html2text.el index f76cb3056..47c2e1cae 100644 --- a/lisp/html2text.el +++ b/lisp/html2text.el @@ -1,6 +1,5 @@ ;;; html2text.el --- a simple html to plain text converter - -;; Copyright (C) 2002 Free Software Foundation, Inc. +;; Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc. ;; Author: Joakim Hove @@ -43,7 +42,8 @@ (defvar html2text-format-single-element-list '(("hr" . html2text-clean-hr))) (defvar html2text-replace-list - '((" " . " ") (">" . ">") ("<" . "<") (""" . "\"")) + '((" " . " ") (">" . ">") ("<" . "<") (""" . "\"") + ("&" . "&") ("'" . "'")) "The map of entity to text. This is an alist were each element is a dotted pair consisting of an @@ -119,9 +119,7 @@ formatting, and then moved afterward.") (defun html2text-buffer-head () (if (string= mode-name "Article") (beginning-of-buffer) - (beginning-of-buffer) - ) - ) + (beginning-of-buffer))) (defun html2text-replace-string (from-string to-string p1 p2) (goto-char p1) @@ -129,11 +127,8 @@ formatting, and then moved afterward.") (change 0)) (while (search-forward from-string p2 t) (replace-match to-string) - (setq change (+ change delta)) - ) - change - ) - ) + (setq change (+ change delta))) + change)) ;; ;; @@ -147,108 +142,26 @@ formatting, and then moved afterward.") ;; (defun html2text-attr-value (attr-list attr) - (nth 1 (assoc attr attr-list)) - ) - -(defun html2text-get-attr (p1 p2 tag) - (goto-char p1) - (re-search-forward " +[^ ]" p2 t) - (let* ((attr-string (buffer-substring-no-properties (1- (point)) (1- p2))) - (tmp-list (split-string attr-string)) - (attr-list) - (counter 0) - (prev (car tmp-list)) - (this (nth 1 tmp-list)) - (next (nth 2 tmp-list)) - (index 1)) - - (cond - ;; size=3 - ((string-match "[^ ]=[^ ]" prev) - (let ((attr (nth 0 (split-string prev "="))) - (value (nth 1 (split-string prev "=")))) - (setq attr-list (cons (list attr value) attr-list)) - ) - ) - ;; size= 3 - ((string-match "[^ ]=\\'" prev) - (setq attr-list (cons (list (substring prev 0 -1) this) attr-list)) - ) - ) - - (while (< index (length tmp-list)) - (cond - ;; size=3 - ((string-match "[^ ]=[^ ]" this) - (let ((attr (nth 0 (split-string this "="))) - (value (nth 1 (split-string this "=")))) - (setq attr-list (cons (list attr value) attr-list)) - ) - ) - ;; size =3 - ((string-match "\\`=[^ ]" this) - (setq attr-list (cons (list prev (substring this 1)) attr-list))) - - ;; size= 3 - ((string-match "[^ ]=\\'" this) - (setq attr-list (cons (list (substring this 0 -1) next) attr-list)) - ) - - ;; size = 3 - ((string= "=" this) - (setq attr-list (cons (list prev next) attr-list)) - ) - ) - (setq index (1+ index)) - (setq prev this) - (setq this next) - (setq next (nth (1+ index) tmp-list)) - ) - - ;; - ;; Tags with no accompanying "=" i.e. value=nil - ;; - (setq prev (car tmp-list)) - (setq this (nth 1 tmp-list)) - (setq next (nth 2 tmp-list)) - (setq index 1) - - (if (not (string-match "=" prev)) - (progn - (if (not (string= (substring this 0 1) "=")) - (setq attr-list (cons (list prev nil) attr-list)) - ) - ) - ) - - (while (< index (1- (length tmp-list))) - (if (not (string-match "=" this)) - (if (not (or (string= (substring next 0 1) "=") - (string= (substring prev -1) "="))) - (setq attr-list (cons (list this nil) attr-list)) - ) - ) - (setq index (1+ index)) - (setq prev this) - (setq this next) - (setq next (nth (1+ index) tmp-list)) - ) - - (if this - (progn - (if (not (string-match "=" this)) - (progn - (if (not (string= (substring prev -1) "=")) - (setq attr-list (cons (list this nil) attr-list)) - ) - ) - ) - ) - ) - attr-list ;; return - value - ) - ) - + (nth 1 (assoc attr attr-list))) + +(defun html2text-get-attr (p1 p2) + (save-restriction + (narrow-to-region p1 p2) + (let (result) + (goto-char (point-min)) + (while (not (eobp)) + (when (re-search-forward "[^= ]+" nil t) + (push + (list + (match-string 0) + (when (looking-at " *= *") + (goto-char (match-end 0)) + (buffer-substring + (point) + (goto-char (or (ignore-errors (scan-sexps (point) 1)) + (point-max)))))) + result))) + result))) ;; ;; ;; @@ -272,10 +185,7 @@ formatting, and then moved afterward.") (cond ((string= list-type "ul") (insert " o ")) ((string= list-type "ol") (insert (format " %s: " item-nr))) - (t (insert " x "))) - ) - ) - ) + (t (insert " x ")))))) (defun html2text-clean-dtdd (p1 p2) (goto-char p1) @@ -287,9 +197,8 @@ formatting, and then moved afterward.") (while (< item-nr items) (setq item-nr (1+ item-nr)) (re-search-forward "
\\([ ]*\\)" (point-max) t) - (if (match-string 1) - (kill-region (point) (- (point) (string-width (match-string 1)))) - ) + (when (match-string 1) + (delete-region (point) (- (point) (string-width (match-string 1))))) (let ((def-p1 (point)) (def-p2 0)) (re-search-forward "\\([ ]*\\)\\(
\\|
\\)" (point-max) t) @@ -299,92 +208,71 @@ formatting, and then moved afterward.") (mw2 (string-width (match-string 2))) (mw (+ mw1 mw2))) (goto-char (- (point) mw)) - (kill-region (point) (+ (point) mw1)) - (setq def-p2 (point)) - ) - ) + (delete-region (point) (+ (point) mw1)) + (setq def-p2 (point)))) (setq def-p2 (- (point) (string-width (match-string 2))))) - (put-text-property def-p1 def-p2 'face 'bold) - ) - ) - ) - ) + (put-text-property def-p1 def-p2 'face 'bold))))) (defun html2text-delete-tags (p1 p2 p3 p4) - (kill-region p1 p2) - (kill-region (- p3 (- p2 p1)) (- p4 (- p2 p1))) - ) + (delete-region p1 p2) + (delete-region (- p3 (- p2 p1)) (- p4 (- p2 p1)))) (defun html2text-delete-single-tag (p1 p2) - (kill-region p1 p2) - ) + (delete-region p1 p2)) (defun html2text-clean-hr (p1 p2) (html2text-delete-single-tag p1 p2) (goto-char p1) (newline 1) - (insert (make-string fill-column ?-)) - ) + (insert (make-string fill-column ?-))) (defun html2text-clean-ul (p1 p2 p3 p4) (html2text-delete-tags p1 p2 p3 p4) - (html2text-clean-list-items p1 (- p3 (- p1 p2)) "ul") - ) + (html2text-clean-list-items p1 (- p3 (- p1 p2)) "ul")) (defun html2text-clean-ol (p1 p2 p3 p4) (html2text-delete-tags p1 p2 p3 p4) - (html2text-clean-list-items p1 (- p3 (- p1 p2)) "ol") - ) + (html2text-clean-list-items p1 (- p3 (- p1 p2)) "ol")) (defun html2text-clean-dl (p1 p2 p3 p4) (html2text-delete-tags p1 p2 p3 p4) - (html2text-clean-dtdd p1 (- p3 (- p1 p2))) - ) + (html2text-clean-dtdd p1 (- p3 (- p1 p2)))) (defun html2text-clean-center (p1 p2 p3 p4) (html2text-delete-tags p1 p2 p3 p4) - (center-region p1 (- p3 (- p2 p1))) - ) + (center-region p1 (- p3 (- p2 p1)))) (defun html2text-clean-bold (p1 p2 p3 p4) (put-text-property p2 p3 'face 'bold) - (html2text-delete-tags p1 p2 p3 p4) - ) + (html2text-delete-tags p1 p2 p3 p4)) (defun html2text-clean-title (p1 p2 p3 p4) (put-text-property p2 p3 'face 'bold) - (html2text-delete-tags p1 p2 p3 p4) - ) + (html2text-delete-tags p1 p2 p3 p4)) (defun html2text-clean-underline (p1 p2 p3 p4) (put-text-property p2 p3 'face 'underline) - (html2text-delete-tags p1 p2 p3 p4) - ) + (html2text-delete-tags p1 p2 p3 p4)) (defun html2text-clean-italic (p1 p2 p3 p4) (put-text-property p2 p3 'face 'italic) - (html2text-delete-tags p1 p2 p3 p4) - ) + (html2text-delete-tags p1 p2 p3 p4)) (defun html2text-clean-font (p1 p2 p3 p4) - (html2text-delete-tags p1 p2 p3 p4) - ) + (html2text-delete-tags p1 p2 p3 p4)) (defun html2text-clean-blockquote (p1 p2 p3 p4) - (html2text-delete-tags p1 p2 p3 p4) - ) + (html2text-delete-tags p1 p2 p3 p4)) (defun html2text-clean-anchor (p1 p2 p3 p4) - ;; If someone can explain how to make the URL clickable - ;; I will surely improve upon this. - (let* ((attr-list (html2text-get-attr p1 p2 "a")) - (href (html2text-attr-value attr-list "href"))) - (kill-region p1 p4) - (goto-char p1) - (insert (substring href 1 -1 )) - (put-text-property p1 (point) 'face 'bold) - ) - ) + ;; If someone can explain how to make the URL clickable I will + ;; surely improve upon this. + (let ((href (html2text-attr-value (html2text-get-attr p1 p2) "href"))) + (delete-region p1 p4) + (when href + (goto-char p1) + (insert (substring href 1 -1 )) + (put-text-property p1 (point) 'face 'bold)))) ;; ;; @@ -402,38 +290,29 @@ formatting, and then moved afterward.") (let ((has-br-line) (refill-start) (refill-stop)) - (if (re-search-forward "
$" p2 t) - (setq has-br-line t) - ) - (if has-br-line - (progn - (goto-char p1) - (if (re-search-forward ".+[^<][^b][^r][^>]$" p2 t) - (progn - (beginning-of-line) - (setq refill-start (point)) - (goto-char p2) - (re-search-backward ".+[^<][^b][^r][^>]$" refill-start t) - (next-line 1) - (end-of-line) - ;; refill-stop should ideally be adjusted to - ;; accomodate the "
" strings which are removed - ;; between refill-start and refill-stop. Can simply - ;; be returned from my-replace-string - (setq refill-stop (+ (point) - (html2text-replace-string - "
" "" - refill-start (point)))) - ;; (message "Point = %s refill-stop = %s" (point) refill-stop) - ;; (sleep-for 4) - (fill-region refill-start refill-stop) - ) - ) - ) - ) - ) - (html2text-replace-string "
" "" p1 p2) - ) + (when (re-search-forward "
$" p2 t) + (setq has-br-line t)) + (when has-br-line + (goto-char p1) + (when (re-search-forward ".+[^<][^b][^r][^>]$" p2 t) + (beginning-of-line) + (setq refill-start (point)) + (goto-char p2) + (re-search-backward ".+[^<][^b][^r][^>]$" refill-start t) + (next-line 1) + (end-of-line) + ;; refill-stop should ideally be adjusted to + ;; accomodate the "
" strings which are removed + ;; between refill-start and refill-stop. Can simply + ;; be returned from my-replace-string + (setq refill-stop (+ (point) + (html2text-replace-string + "
" "" + refill-start (point)))) + ;; (message "Point = %s refill-stop = %s" (point) refill-stop) + ;; (sleep-for 4) + (fill-region refill-start refill-stop)))) + (html2text-replace-string "
" "" p1 p2)) ;; ;; This one is interactive ... @@ -447,17 +326,14 @@ fashion, quite close to pure guess-work. It does work in some cases though." ;; Removing lonely
on a single line, if they are left intact we ;; dont have any paragraphs at all. (html2text-buffer-head) - (while (< (point) (point-max)) + (while (not (eobp)) (let ((p1 (point))) (forward-paragraph 1) ;;(message "Kaller fix med p1=%s p2=%s " p1 (1- (point))) (sleep-for 5) (html2text-fix-paragraph p1 (1- (point))) (goto-char p1) - (if (< (point) (point-max)) - (forward-paragraph 1)) - ) - ) - ) + (when (not (eobp)) + (forward-paragraph 1))))) ;; ;; @@ -476,14 +352,8 @@ See the documentation for that variable." (interactive) (dolist (tag tag-list) (html2text-buffer-head) - (while (re-search-forward (format "\\(]*>\\)" tag) (point-max) t) - (let ((p1 (point))) - (search-backward "<") - (kill-region (point) p1) - ) - ) - ) - ) + (while (re-search-forward (format "]*>" tag) (point-max) t) + (delete-region (match-beginning 0) (match-end 0))))) (defun html2text-format-tags () "See the variable \"html2text-format-tag-list\" for documentation" @@ -492,12 +362,12 @@ See the documentation for that variable." (let ((tag (car tag-and-function)) (function (cdr tag-and-function))) (html2text-buffer-head) - (while (re-search-forward (format "\\(<%s\\( [^>]*\\)?>\\)" tag) + (while (re-search-forward (format "<%s\\( [^>]*\\)?>" tag) (point-max) t) (let ((p1) (p2 (point)) (p3) (p4) - (attr (match-string 1))) + (attr (match-string 0))) (search-backward "<" (point-min) t) (setq p1 (point)) (re-search-forward (format "" tag) (point-max) t) @@ -506,11 +376,7 @@ See the documentation for that variable." (setq p3 (point)) (funcall function p1 p2 p3 p4) (goto-char p1) - ) - ) - ) - ) - ) + ))))) (defun html2text-substitute () "See the variable \"html2text-replace-list\" for documentation" @@ -519,10 +385,8 @@ See the documentation for that variable." (html2text-buffer-head) (let ((old-string (car e)) (new-string (cdr e))) - (html2text-replace-string old-string new-string (point-min) (point-max)) - ) - ) - ) + (html2text-replace-string old-string new-string (point-min) (point-max))) + )) (defun html2text-format-single-elements () "" @@ -531,18 +395,13 @@ See the documentation for that variable." (let ((tag (car tag-and-function)) (function (cdr tag-and-function))) (html2text-buffer-head) - (while (re-search-forward (format "\\(<%s\\( [^>]*\\)?>\\)" tag) + (while (re-search-forward (format "<%s\\( [^>]*\\)?>" tag) (point-max) t) (let ((p1) (p2 (point))) (search-backward "<" (point-min) t) (setq p1 (point)) - (funcall function p1 p2) - ) - ) - ) - ) - ) + (funcall function p1 p2)))))) ;; ;; Main function @@ -566,4 +425,5 @@ See the documentation for that variable." ;; ;; +;;; arch-tag: e9e57b79-35d4-4de1-a647-e7e01fe56d1e ;;; html2text.el ends here