X-Git-Url: http://cgit.sxemacs.org/?a=blobdiff_plain;f=lisp%2Frfc2047.el;h=b9b2e36536dd17f781a72b917aabcf1b9c9f60b7;hb=70df8dffbb57036113fb6b34e46d17eb1c57d1d1;hp=358ccf7dcb6f60b1f36bfc7eee914f351610e302;hpb=7d7727a9f1a6bf766c4685353caacf667cb91a54;p=gnus diff --git a/lisp/rfc2047.el b/lisp/rfc2047.el index 358ccf7dc..b9b2e3653 100644 --- a/lisp/rfc2047.el +++ b/lisp/rfc2047.el @@ -1,5 +1,5 @@ -;;; rfc2047.el --- Functions for encoding and decoding rfc2047 messages -;; Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc. +;;; rfc2047.el --- functions for encoding and decoding rfc2047 messages +;; Copyright (C) 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc. ;; Author: Lars Magne Ingebrigtsen ;; MORIOKA Tomohiko @@ -27,22 +27,23 @@ ;;; Code: -(eval-when-compile (require 'cl)) +(eval-when-compile + (require 'cl) + (defvar message-posting-charset)) (require 'qp) (require 'mm-util) -(require 'ietf-drums) +;; Fixme: Avoid this (used for mail-parse-charset) mm dependence on gnus. (require 'mail-prsvr) (require 'base64) -;; Fixme: Avoid this (for gnus-point-at-...) mm dependence on gnus. -(require 'gnus-util) (autoload 'mm-body-7-or-8 "mm-bodies") (defvar rfc2047-header-encoding-alist '(("Newsgroups" . nil) + ("Followup-To" . nil) ("Message-ID" . nil) - ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\)" . - "-A-Za-z0-9!*+/=_") + ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\ +\\|Mail-Followup-To\\|Mail-Copies-To\\|Approved\\)" . address-mime) (t . mime)) "*Header/encoding method alist. The list is traversed sequentially. The keys can either be @@ -52,10 +53,11 @@ The values can be: 1) nil, in which case no encoding is done; 2) `mime', in which case the header will be encoded according to RFC2047; -3) a charset, in which case it will be encoded as that charset; -4) `default', in which case the field will be encoded as the rest - of the article. -5) a string, like `mime', expect for using it as word-chars.") +3) `address-mime', like `mime', but takes account of the rules for address + fields (where quoted strings and comments must be treated separately); +4) a charset, in which case it will be encoded as that charset; +5) `default', in which case the field will be encoded as the rest + of the article.") (defvar rfc2047-charset-encoding-alist '((us-ascii . nil) @@ -65,8 +67,8 @@ The values can be: (iso-8859-4 . Q) (iso-8859-5 . B) (koi8-r . B) - (iso-8859-7 . Q) - (iso-8859-8 . Q) + (iso-8859-7 . B) + (iso-8859-8 . B) (iso-8859-9 . Q) (iso-8859-14 . Q) (iso-8859-15 . Q) @@ -79,9 +81,11 @@ The values can be: (cn-gb-2312 . B) (euc-kr . B) (iso-2022-jp-2 . B) - (iso-2022-int-1 . B)) + (iso-2022-int-1 . B) + (viscii . Q)) "Alist of MIME charsets to RFC2047 encodings. -Valid encodings are nil, `Q' and `B'.") +Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding, +quoted-printable and base64 respectively.") (defvar rfc2047-encoding-function-alist '((Q . rfc2047-q-encode-region) @@ -89,19 +93,29 @@ Valid encodings are nil, `Q' and `B'.") (nil . ignore)) "Alist of RFC2047 encodings to encoding functions.") -(defvar rfc2047-q-encoding-alist - '(("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\):" - . "-A-Za-z0-9!*+/" ) - ;; = (\075), _ (\137), ? (\077) are used in the encoded word. - ;; Avoid using 8bit characters. - ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?" - ("." . "\010\012\014\040-\074\076\100-\136\140-\177")) - "Alist of header regexps and valid Q characters.") - ;;; ;;; Functions for encoding RFC2047 messages ;;; +(defun rfc2047-qp-or-base64 () + "Return the type with which to encode the buffer. +This is either `base64' or `quoted-printable'." + (save-excursion + (let ((limit (min (point-max) (+ 2000 (point-min)))) + (n8bit 0)) + (goto-char (point-min)) + (skip-chars-forward "\x20-\x7f\r\n\t" limit) + (while (< (point) limit) + (incf n8bit) + (forward-char 1) + (skip-chars-forward "\x20-\x7f\r\n\t" limit)) + (if (or (< (* 6 n8bit) (- limit (point-min))) + ;; Don't base64, say, a short line with a single + ;; non-ASCII char when splitting parts by charset. + (= n8bit 1)) + 'quoted-printable + 'base64)))) + (defun rfc2047-narrow-to-field () "Narrow the buffer to the header on the current line." (beginning-of-line) @@ -110,12 +124,24 @@ Valid encodings are nil, `Q' and `B'.") (progn (forward-line 1) (if (re-search-forward "^[^ \n\t]" nil t) - (progn - (beginning-of-line) - (point)) + (point-at-bol) (point-max)))) (goto-char (point-min))) +(defun rfc2047-field-value () + "Return the value of the field at point." + (save-excursion + (save-restriction + (rfc2047-narrow-to-field) + (re-search-forward ":[ \t\n]*" nil t) + (buffer-substring (point) (point-max))))) + +(defvar rfc2047-encoding-type 'address-mime + "The type of encoding done by `rfc2047-encode-region'. +This should be dynamically bound around calls to +`rfc2047-encode-region' to either `mime' or `address-mime'. See +`rfc2047-header-encoding-alist', for definitions.") + (defun rfc2047-encode-message-header () "Encode the message header according to `rfc2047-header-encoding-alist'. Should be called narrowed to the head of the message." @@ -133,11 +159,10 @@ Should be called narrowed to the head of the message." (mm-coding-system-p (car message-posting-charset))) ;; 8 bit must be decoded. - ;; Is message-posting-charset a coding system? (mm-encode-coding-region (point-min) (point-max) - (car message-posting-charset)) - nil) + (mm-charset-to-coding-system + (car message-posting-charset)))) ;; No encoding necessary, but folding is nice (rfc2047-fold-region (save-excursion @@ -156,46 +181,46 @@ Should be called narrowed to the head of the message." (eq (car elem) t)) (setq alist nil method (cdr elem)))) + (goto-char (point-min)) + (re-search-forward "^[^:]+: *" nil t) (cond - ((stringp method) - (rfc2047-encode-region (point-min) (point-max) method)) + ((eq method 'address-mime) + (rfc2047-encode-region (point) (point-max))) ((eq method 'mime) - (rfc2047-encode-region (point-min) (point-max))) + (let ((rfc2047-encoding-type 'mime)) + (rfc2047-encode-region (point) (point-max)))) ((eq method 'default) (if (and (featurep 'mule) (if (boundp 'default-enable-multibyte-characters) default-enable-multibyte-characters) mail-parse-charset) - (mm-encode-coding-region (point-min) (point-max) + (mm-encode-coding-region (point) (point-max) mail-parse-charset))) ;; We get this when CC'ing messsages to newsgroups with - ;; 8-bit names. The group name mail copy just get + ;; 8-bit names. The group name mail copy just got ;; unconditionally encoded. Previously, it would ask ;; whether to encode, which was quite confusing for the ;; user. If the new behaviour is wrong, tell me. I have ;; left the old code commented out below. ;; -- Per Abrahamsen Date: 2001-10-07. + ;; Modified by Dave Love, with the commented-out code changed + ;; in accordance with changes elsewhere. ((null method) - (when (delq 'ascii - (mm-find-charset-region (point-min) (point-max))) - (rfc2047-encode-region (point-min) (point-max)))) -;;; ((null method) -;;; (and (delq 'ascii -;;; (mm-find-charset-region (point-min) -;;; (point-max))) -;;; (if (or (message-options-get -;;; 'rfc2047-encode-message-header-encode-any) -;;; (message-options-set -;;; 'rfc2047-encode-message-header-encode-any -;;; (y-or-n-p -;;; "Some texts are not encoded. Encode anyway?"))) -;;; (rfc2047-encode-region (point-min) (point-max)) -;;; (error "Cannot send unencoded text")))) + (rfc2047-encode-region (point) (point-max))) +;;; ((null method) +;;; (if (or (message-options-get +;;; 'rfc2047-encode-message-header-encode-any) +;;; (message-options-set +;;; 'rfc2047-encode-message-header-encode-any +;;; (y-or-n-p +;;; "Some texts are not encoded. Encode anyway?"))) +;;; (rfc2047-encode-region (point-min) (point-max)) +;;; (error "Cannot send unencoded text"))) ((mm-coding-system-p method) (if (and (featurep 'mule) (if (boundp 'default-enable-multibyte-characters) default-enable-multibyte-characters)) - (mm-encode-coding-region (point-min) (point-max) method))) + (mm-encode-coding-region (point) (point-max) method))) ;; Hm. (t))) (goto-char (point-max))))))) @@ -209,134 +234,234 @@ Should be called narrowed to the head of the message." The buffer may be narrowed." (require 'message) ; for message-posting-charset (let ((charsets - (mapcar - 'mm-mime-charset - (mm-find-charset-region (point-min) (point-max)))) - (cs (list 'us-ascii (car message-posting-charset))) - found) - (while charsets - (unless (memq (pop charsets) cs) - (setq found t))) - found)) - -(defun rfc2047-dissect-region (b e &optional word-chars) - "Dissect the region between B and E into words." - (unless word-chars - ;; Anything except most CTLs, WSP - (setq word-chars "\010\012\014\041-\177")) - (let (mail-parse-mule-charset - words point current - result word) - (save-restriction - (narrow-to-region b e) - (goto-char (point-min)) - (skip-chars-forward "\000-\177") - (while (not (eobp)) - (setq point (point)) - (skip-chars-backward word-chars b) - (unless (eq b (point)) - (push (cons (buffer-substring b (point)) nil) words)) - (setq b (point)) - (goto-char point) - (setq current (mm-charset-after)) - (forward-char 1) - (skip-chars-forward word-chars) - (while (and (not (eobp)) - (eq (mm-charset-after) current)) - (forward-char 1) - (skip-chars-forward word-chars)) - (unless (eq b (point)) - (push (cons (buffer-substring b (point)) current) words)) - (setq b (point)) - (skip-chars-forward "\000-\177")) - (unless (eq b (point)) - (push (cons (buffer-substring b (point)) nil) words))) - ;; merge adjacent words - (setq word (pop words)) - (while word - (if (and (cdr word) - (caar words) - (not (cdar words)) - (not (string-match "[^ \t]" (caar words)))) - (if (eq (cdr (nth 1 words)) (cdr word)) - (progn - (setq word (cons (concat - (car (nth 1 words)) (caar words) - (car word)) - (cdr word))) - (pop words) - (pop words)) - (push (cons (concat (caar words) (car word)) (cdr word)) - result) - (pop words) - (setq word (pop words))) - (push word result) - (setq word (pop words)))) - result)) - -(defun rfc2047-encode-region (b e &optional word-chars) - "Encode all encodable words in region B to E." - (let ((words (rfc2047-dissect-region b e word-chars)) word) - (save-restriction - (narrow-to-region b e) - (delete-region (point-min) (point-max)) - (while (setq word (pop words)) - (if (not (cdr word)) - (insert (car word)) - (rfc2047-fold-region (gnus-point-at-bol) (point)) - (goto-char (point-max)) - (if (> (- (point) (save-restriction - (widen) - (gnus-point-at-bol))) 76) - (insert "\n ")) - ;; Insert blank between encoded words - (if (eq (char-before) ?=) (insert " ")) - (rfc2047-encode (point) - (progn (insert (car word)) (point)) - (cdr word)))) - (rfc2047-fold-region (point-min) (point-max))))) - -(defun rfc2047-encode-string (string &optional word-chars) - "Encode words in STRING." + (mm-find-mime-charset-region (point-min) (point-max)))) + (and charsets + (not (equal charsets (list (car message-posting-charset))))))) + +;; Use this syntax table when parsing into regions that may need +;; encoding. Double quotes are string delimiters, backslash is +;; character quoting, and all other RFC 2822 special characters are +;; treated as punctuation so we can use forward-sexp/forward-word to +;; skip to the end of regions appropriately. Nb. ietf-drums does +;; things differently. +(defconst rfc2047-syntax-table + ;; (make-char-table 'syntax-table '(2)) only works in Emacs. + (let ((table (make-syntax-table))) + ;; The following is done to work for setting all elements of the table + ;; in Emacs 21 and 22 and XEmacs; it appears to be the cleanest way. + ;; Play safe and don't assume the form of the word syntax entry -- + ;; copy it from ?a. + (if (fboundp 'set-char-table-range) ; Emacs + (funcall (intern "set-char-table-range") + table t (aref (standard-syntax-table) ?a)) + (if (fboundp 'put-char-table) + (if (fboundp 'get-char-table) ; warning avoidance + (put-char-table t (get-char-table ?a (standard-syntax-table)) + table)))) + (modify-syntax-entry ?\\ "\\" table) + (modify-syntax-entry ?\" "\"" table) + (modify-syntax-entry ?\( "." table) + (modify-syntax-entry ?\) "." table) + (modify-syntax-entry ?\< "." table) + (modify-syntax-entry ?\> "." table) + (modify-syntax-entry ?\[ "." table) + (modify-syntax-entry ?\] "." table) + (modify-syntax-entry ?: "." table) + (modify-syntax-entry ?\; "." table) + (modify-syntax-entry ?, "." table) + (modify-syntax-entry ?@ "." table) + table)) + +(defun rfc2047-encode-region (b e) + "Encode words in region B to E that need encoding. +By default, the region is treated as containing RFC2822 addresses. +Dynamically bind `rfc2047-encoding-type' to change that." + (save-restriction + (narrow-to-region b e) + (if (eq 'mime rfc2047-encoding-type) + ;; Simple case. Treat as single word after any initial ASCII + ;; part and before any tailing ASCII part. The leading ASCII + ;; is relevant for instance in Subject headers with `Re:' for + ;; interoperability with non-MIME clients, and we might as + ;; well avoid the tail too. + (progn + (goto-char (point-min)) + ;; Does it need encoding? + (skip-chars-forward "\000-\177") + (unless (eobp) + (skip-chars-backward "^ \n") ; beginning of space-delimited word + (rfc2047-encode (point) (progn + (goto-char e) + (skip-chars-backward "\000-\177") + (skip-chars-forward "^ \n") + ;; end of space-delimited word + (point))))) + ;; `address-mime' case -- take care of quoted words, comments. + (with-syntax-table rfc2047-syntax-table + (let ((start) ; start of current token + end ; end of current token + ;; Whether there's an encoded word before the current + ;; token, either immediately or separated by space. + last-encoded) + (goto-char (point-min)) + (condition-case nil ; in case of unbalanced quotes + ;; Look for rfc2822-style: sequences of atoms, quoted + ;; strings, specials, whitespace. (Specials mustn't be + ;; encoded.) + (while (not (eobp)) + (setq start (point)) + ;; Skip whitespace. + (unless (= 0 (skip-chars-forward " \t\n")) + (setq start (point))) + (cond + ((not (char-after))) ; eob + ;; else token start + ((eq ?\" (char-syntax (char-after))) + ;; Quoted word. + (forward-sexp) + (setq end (point)) + ;; Does it need encoding? + (goto-char start) + (skip-chars-forward "\000-\177" end) + (if (= end (point)) + (setq last-encoded nil) + ;; It needs encoding. Strip the quotes first, + ;; since encoded words can't occur in quotes. + (goto-char end) + (delete-backward-char 1) + (goto-char start) + (delete-char 1) + (when last-encoded + ;; There was a preceding quoted word. We need + ;; to include any separating whitespace in this + ;; word to avoid it getting lost. + (skip-chars-backward " \t") + ;; A space is needed between the encoded words. + (insert ? ) + (setq start (point) + end (1+ end))) + ;; Adjust the end position for the deleted quotes. + (rfc2047-encode start (- end 2)) + (setq last-encoded t))) ; record that it was encoded + ((eq ?. (char-syntax (char-after))) + ;; Skip other delimiters, but record that they've + ;; potentially separated quoted words. + (forward-char) + (setq last-encoded nil)) + (t ; normal token/whitespace sequence + ;; Find the end. + (forward-word 1) + (skip-chars-backward " \t") + (setq end (point)) + ;; Deal with encoding and leading space as for + ;; quoted words. + (goto-char start) + (skip-chars-forward "\000-\177" end) + (if (= end (point)) + (setq last-encoded nil) + (when last-encoded + (goto-char start) + (skip-chars-backward " \t") + (insert ? ) + (setq start (point) + end (1+ end))) + (rfc2047-encode start end) + (setq last-encoded t))))) + (error + (error "Invalid data for rfc2047 encoding: %s" + (buffer-substring b e))))))) + (rfc2047-fold-region b (point)))) + +(defun rfc2047-encode-string (string) + "Encode words in STRING. +By default, the string is treated as containing addresses (see +`rfc2047-encoding-type')." (with-temp-buffer (insert string) - (rfc2047-encode-region (point-min) (point-max) word-chars) + (rfc2047-encode-region (point-min) (point-max)) (buffer-string))) -(defun rfc2047-encode (b e charset) - "Encode the word in the region B to E with CHARSET." - (let* ((mime-charset (mm-mime-charset charset)) - (cs (mm-charset-to-coding-system mime-charset)) +(defun rfc2047-encode (b e) + "Encode the word(s) in the region B to E. +By default, the region is treated as containing addresses (see +`rfc2047-encoding-type')." + (let* ((mime-charset (mm-find-mime-charset-region b e)) + (cs (if (> (length mime-charset) 1) + ;; Fixme: Instead of this, try to break region into + ;; parts that can be encoded separately. + (error "Can't rfc2047-encode `%s'" + (buffer-substring b e)) + (setq mime-charset (car mime-charset)) + (mm-charset-to-coding-system mime-charset))) + ;; Fixme: Better, calculate the number of non-ASCII + ;; characters, at least for 8-bit charsets. (encoding (or (cdr (assq mime-charset rfc2047-charset-encoding-alist)) - 'B)) + ;; For the charsets that don't have a preferred + ;; encoding, choose the one that's shorter. + (save-restriction + (narrow-to-region b e) + (if (eq (rfc2047-qp-or-base64) 'base64) + 'B + 'Q)))) (start (concat "=?" (downcase (symbol-name mime-charset)) "?" - (downcase (symbol-name encoding)) "?")) + (upcase (symbol-name encoding)) "?")) + (factor (case mime-charset + ((iso-8859-5 iso-8859-7 iso-8859-8 koi8-r) 1) + ((big5 gb2312 euc-kr) 2) + (utf-8 4) + (t 8))) + (pre (- b (save-restriction + (widen) + (point-at-bol)))) + ;; encoded-words must not be longer than 75 characters, + ;; including charset, encoding etc. This leaves us with + ;; 75 - (length start) - 2 - 2 characters. The last 2 is for + ;; possible base64 padding. In the worst case (iso-2022-*) + ;; each character expands to 8 bytes which is expanded by a + ;; factor of 4/3 by base64 encoding. + (length (floor (- 75 (length start) 4) (* factor (/ 4.0 3.0)))) + ;; Limit line length to 76 characters. + (length1 (max 1 (floor (- 76 (length start) 4 pre) + (* factor (/ 4.0 3.0))))) (first t)) + (if mime-charset + (save-restriction + (narrow-to-region b e) + (when (eq encoding 'B) + ;; break into lines before encoding + (goto-char (point-min)) + (while (not (eobp)) + (if first + (progn + (goto-char (min (point-max) (+ length1 (point)))) + (setq first nil)) + (goto-char (min (point-max) (+ length (point))))) + (unless (eobp) + (insert ?\n))) + (setq first t)) + (if (and (mm-multibyte-p) + (mm-coding-system-p cs)) + (mm-encode-coding-region (point-min) (point-max) cs)) + (funcall (cdr (assq encoding rfc2047-encoding-function-alist)) + (point-min) (point-max)) + (goto-char (point-min)) + (while (not (eobp)) + (unless first + (insert ? )) + (setq first nil) + (insert start) + (end-of-line) + (insert "?=") + (forward-line 1)))))) + +(defun rfc2047-fold-field () + "Fold the current header field." + (save-excursion (save-restriction - (narrow-to-region b e) - (when (eq encoding 'B) - ;; break into lines before encoding - (goto-char (point-min)) - (while (not (eobp)) - (goto-char (min (point-max) (+ 15 (point)))) - (unless (eobp) - (insert "\n")))) - (if (and (mm-multibyte-p) - (mm-coding-system-p cs)) - (mm-encode-coding-region (point-min) (point-max) cs)) - (funcall (cdr (assq encoding rfc2047-encoding-function-alist)) - (point-min) (point-max)) - (goto-char (point-min)) - (while (not (eobp)) - (unless first - (insert " ")) - (setq first nil) - (insert start) - (end-of-line) - (insert "?=") - (forward-line 1))))) + (rfc2047-narrow-to-field) + (rfc2047-fold-region (point-min) (point-max))))) (defun rfc2047-fold-region (b e) "Fold long lines in region B to E." @@ -345,21 +470,24 @@ The buffer may be narrowed." (goto-char (point-min)) (let ((break nil) (qword-break nil) + (first t) (bol (save-restriction (widen) - (gnus-point-at-bol)))) + (point-at-bol)))) (while (not (eobp)) - (when (and (or break qword-break) (> (- (point) bol) 76)) + (when (and (or break qword-break) + (> (- (point) bol) 76)) (goto-char (or break qword-break)) (setq break nil qword-break nil) (if (looking-at "[ \t]") - (insert "\n") + (insert ?\n) (insert "\n ")) (setq bol (1- (point))) ;; Don't break before the first non-LWSP characters. (skip-chars-forward " \t") - (unless (eobp) (forward-char 1))) + (unless (eobp) + (forward-char 1))) (cond ((eq (char-after) ?\n) (forward-char 1) @@ -373,27 +501,41 @@ The buffer may be narrowed." (forward-char 1)) ((memq (char-after) '(? ?\t)) (skip-chars-forward " \t") - (setq break (1- (point)))) + (if first + ;; Don't break just after the header name. + (setq first nil) + (setq break (1- (point))))) ((not break) (if (not (looking-at "=\\?[^=]")) (if (eq (char-after) ?=) (forward-char 1) (skip-chars-forward "^ \t\n\r=")) - (setq qword-break (point)) + ;; Don't break at the start of the field. + (unless (= (point) b) + (setq qword-break (point))) (skip-chars-forward "^ \t\n\r"))) (t (skip-chars-forward "^ \t\n\r")))) - (when (and (or break qword-break) (> (- (point) bol) 76)) + (when (and (or break qword-break) + (> (- (point) bol) 76)) (goto-char (or break qword-break)) (setq break nil qword-break nil) (if (looking-at "[ \t]") - (insert "\n") + (insert ?\n) (insert "\n ")) (setq bol (1- (point))) ;; Don't break before the first non-LWSP characters. (skip-chars-forward " \t") - (unless (eobp) (forward-char 1)))))) + (unless (eobp) + (forward-char 1)))))) + +(defun rfc2047-unfold-field () + "Fold the current line." + (save-excursion + (save-restriction + (rfc2047-narrow-to-field) + (rfc2047-unfold-region (point-min) (point-max))))) (defun rfc2047-unfold-region (b e) "Unfold lines in region B to E." @@ -402,21 +544,18 @@ The buffer may be narrowed." (goto-char (point-min)) (let ((bol (save-restriction (widen) - (gnus-point-at-bol))) - (eol (gnus-point-at-eol)) - leading) + (point-at-bol))) + (eol (point-at-eol))) (forward-line 1) (while (not (eobp)) - (looking-at "[ \t]*") - (setq leading (- (match-end 0) (match-beginning 0))) - (if (< (- (gnus-point-at-eol) bol leading) 76) - (progn - (goto-char eol) - (delete-region eol (progn - (skip-chars-forward " \t\n\r") - (1- (point))))) - (setq bol (gnus-point-at-bol))) - (setq eol (gnus-point-at-eol)) + (if (and (looking-at "[ \t]") + (< (- (point-at-eol) bol) 76)) + (delete-region eol (progn + (goto-char eol) + (skip-chars-forward "\r\n") + (point))) + (setq bol (point-at-bol))) + (setq eol (point-at-eol)) (forward-line 1))))) (defun rfc2047-b-encode-region (b e) @@ -434,18 +573,21 @@ The buffer may be narrowed." (save-excursion (save-restriction (narrow-to-region (goto-char b) e) - (let ((alist rfc2047-q-encoding-alist) - (bol (save-restriction + (let ((bol (save-restriction (widen) - (gnus-point-at-bol)))) - (while alist - (when (looking-at (caar alist)) - (mm-with-unibyte-current-buffer-mule4 - (quoted-printable-encode-region - (point-min) (point-max) nil (cdar alist))) - (subst-char-in-region (point-min) (point-max) ? ?_) - (setq alist nil)) - (pop alist)) + (point-at-bol)))) + (quoted-printable-encode-region + b e nil + ;; = (\075), _ (\137), ? (\077) are used in the encoded word. + ;; Avoid using 8bit characters. + ;; This list excludes `especials' (see the RFC2047 syntax), + ;; meaning that some characters in non-structured fields will + ;; get encoded when they con't need to be. The following is + ;; what it used to be. +;;; ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?" +;;; "\010\012\014\040-\074\076\100-\136\140-\177") + "-\b\n\f !#-'*+0-9A-Z\\^`-~\d") + (subst-char-in-region (point-min) (point-max) ? ?_) ;; The size of QP encapsulation is about 20, so set limit to ;; 56=76-20. (unless (< (- (point-max) (point-min)) 56) @@ -456,15 +598,27 @@ The buffer may be narrowed." (goto-char (min (point-max) (+ 56 bol))) (search-backward "=" (- (point) 2) t) (unless (or (bobp) (eobp)) - (insert "\n") + (insert ?\n) (setq bol (point))))))))) ;;; ;;; Functions for decoding RFC2047 messages ;;; -(defvar rfc2047-encoded-word-regexp - "=\\?\\([^][\000-\040()<>@,\;:\\\"/?.=]+\\)\\?\\(B\\|Q\\)\\?\\([!->@-~ +]*\\)\\?=") +(eval-and-compile + (defconst rfc2047-encoded-word-regexp + "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)\\(?:\\*[^?]+\\)?\ +\\?\\(B\\|Q\\)\\?\\([!->@-~ ]*\\)\\?=")) + +;; Fixme: This should decode in place, not cons intermediate strings. +;; Also check whether it needs to worry about delimiting fields like +;; encoding. + +;; In fact it's reported that (invalid) encoding of mailboxes in +;; addr-specs is in use, so delimiting fields might help. Probably +;; not decoding a word which isn't properly delimited is good enough +;; and worthwhile (is it more correct or not?), e.g. something like +;; `=?iso-8859-1?q?foo?=@'. (defun rfc2047-decode-region (start end) "Decode MIME-encoded words in region between START and END." @@ -477,9 +631,10 @@ The buffer may be narrowed." (goto-char (point-min)) ;; Remove whitespace between encoded words. (while (re-search-forward - (concat "\\(" rfc2047-encoded-word-regexp "\\)" - "\\(\n?[ \t]\\)+" - "\\(" rfc2047-encoded-word-regexp "\\)") + (eval-when-compile + (concat "\\(" rfc2047-encoded-word-regexp "\\)" + "\\(\n?[ \t]\\)+" + "\\(" rfc2047-encoded-word-regexp "\\)")) nil t) (delete-region (goto-char (match-end 1)) (match-beginning 6))) ;; Decode the encoded words. @@ -490,8 +645,17 @@ The buffer may be narrowed." (prog1 (match-string 0) (delete-region (match-beginning 0) (match-end 0))))) + ;; Remove newlines between decoded words, though such things + ;; essentially must not be there. + (save-restriction + (narrow-to-region e (point)) + (goto-char e) + (while (re-search-forward "[\n\r]+" nil t) + (replace-match " ")) + (goto-char (point-max))) (when (and (mm-multibyte-p) mail-parse-charset + (not (eq mail-parse-charset 'us-ascii)) (not (eq mail-parse-charset 'gnus-decoded))) (mm-decode-coding-region b e mail-parse-charset)) (setq b (point))) @@ -499,48 +663,79 @@ The buffer may be narrowed." mail-parse-charset (not (eq mail-parse-charset 'us-ascii)) (not (eq mail-parse-charset 'gnus-decoded))) - (mm-decode-coding-region b (point-max) mail-parse-charset)) - (rfc2047-unfold-region (point-min) (point-max)))))) + (mm-decode-coding-region b (point-max) mail-parse-charset)))))) (defun rfc2047-decode-string (string) "Decode the quoted-printable-encoded STRING and return the results." (let ((m (mm-multibyte-p))) - (with-temp-buffer - (when m - (mm-enable-multibyte)) - (insert string) - (inline - (rfc2047-decode-region (point-min) (point-max))) - (buffer-string)))) + (if (string-match "=\\?" string) + (with-temp-buffer + ;; Fixme: This logic is wrong, but seems to be required by + ;; Gnus summary buffer generation. The value of `m' depends + ;; on the current buffer, not global multibyteness or that + ;; of the string. Also the string returned should always be + ;; multibyte in a multibyte session, i.e. the buffer should + ;; be multibyte before `buffer-string' is called. + (when m + (mm-enable-multibyte)) + (insert string) + (inline + (rfc2047-decode-region (point-min) (point-max))) + (buffer-string)) + ;; Fixme: As above, `m' here is inappropriate. + (if (and m + mail-parse-charset + (not (eq mail-parse-charset 'us-ascii)) + (not (eq mail-parse-charset 'gnus-decoded))) + ;; `decode-coding-string' in Emacs offers a third optional + ;; arg NOCOPY to avoid consing a new string if the decoding + ;; is "trivial". Unfortunately it currently doesn't + ;; consider anything else than a `nil' coding system + ;; trivial. + ;; `rfc2047-decode-string' is called multiple times for each + ;; article during summary buffer generation, and we really + ;; want to avoid unnecessary consing. So we bypass + ;; `decode-coding-string' if the string is purely ASCII. + (if (and (fboundp 'detect-coding-string) + ;; string is purely ASCII + (eq (detect-coding-string string t) 'undecided)) + string + (mm-decode-coding-string string mail-parse-charset)) + (mm-string-as-multibyte string))))) (defun rfc2047-parse-and-decode (word) "Decode WORD and return it if it is an encoded word. -Return WORD if not." +Return WORD if it is not not an encoded word or if the charset isn't +decodable." (if (not (string-match rfc2047-encoded-word-regexp word)) word (or (condition-case nil (rfc2047-decode (match-string 1 word) - (upcase (match-string 2 word)) + (string-to-char (match-string 2 word)) (match-string 3 word)) (error word)) - word))) + word))) ; un-decodable (defun rfc2047-pad-base64 (string) "Pad STRING to quartets." ;; Be more liberal to accept buggy base64 strings. If ;; base64-decode-string accepts buggy strings, this function could ;; be aliased to identity. - (case (mod (length string) 4) - (0 string) - (1 string) ;; Error, don't pad it. - (2 (concat string "==")) - (3 (concat string "=")))) + (if (= 0 (mod (length string) 4)) + string + (when (string-match "=+$" string) + (setq string (substring string 0 (match-beginning 0)))) + (case (mod (length string) 4) + (0 string) + (1 string) ;; Error, don't pad it. + (2 (concat string "==")) + (3 (concat string "="))))) (defun rfc2047-decode (charset encoding string) "Decode STRING from the given MIME CHARSET in the given ENCODING. -Valid ENCODINGs are \"B\" and \"Q\". +Valid ENCODINGs are the characters \"B\" and \"Q\". If your Emacs implementation can't decode CHARSET, return nil." (if (stringp charset) (setq charset (intern (downcase charset)))) @@ -558,20 +753,17 @@ If your Emacs implementation can't decode CHARSET, return nil." (when (and (eq cs 'ascii) mail-parse-charset) (setq cs mail-parse-charset)) - (mm-with-unibyte-current-buffer-mule4 - ;; In Emacs Mule 4, decoding UTF-8 should be in unibyte mode. - (mm-decode-coding-string - (cond - ((equal "B" encoding) - (base64-decode-string - (rfc2047-pad-base64 string))) - ((equal "Q" encoding) - (quoted-printable-decode-string - (mm-replace-chars-in-string string ?_ ? ))) - (t (error "Invalid encoding: %s" encoding))) - cs))))) + (mm-decode-coding-string + (cond + ((char-equal ?B encoding) + (base64-decode-string + (rfc2047-pad-base64 string))) + ((char-equal ?Q encoding) + (quoted-printable-decode-string + (mm-subst-char-in-string ?_ ? string t))) + (t (error "Invalid encoding: %c" encoding))) + cs)))) (provide 'rfc2047) ;;; rfc2047.el ends here -