X-Git-Url: http://cgit.sxemacs.org/?a=blobdiff_plain;f=lisp%2Frfc2047.el;h=ebd8a4a41bcb5a350be5f0480d3e8c04fb1123df;hb=0ed5b1813c561b0290b3ebbe2e0eb9bba6156076;hp=7a86311396ec9b472e9a049fcc3b185b9b6e5837;hpb=348561af2b5950d72a20c41e12af9f18fd5fd0d3;p=gnus diff --git a/lisp/rfc2047.el b/lisp/rfc2047.el index 7a8631139..ebd8a4a41 100644 --- a/lisp/rfc2047.el +++ b/lisp/rfc2047.el @@ -1,5 +1,5 @@ -;;; rfc2047.el --- Functions for encoding and decoding rfc2047 messages -;; Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc. +;;; rfc2047.el --- functions for encoding and decoding rfc2047 messages +;; Copyright (C) 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc. ;; Author: Lars Magne Ingebrigtsen ;; MORIOKA Tomohiko @@ -22,32 +22,41 @@ ;;; Commentary: +;; RFC 2047 is "MIME (Multipurpose Internet Mail Extensions) Part +;; Three: Message Header Extensions for Non-ASCII Text". + ;;; Code: -(eval-and-compile - (eval - '(unless (fboundp 'base64-decode-string) - (require 'base64)))) +(eval-when-compile + (require 'cl) + (defvar message-posting-charset)) (require 'qp) (require 'mm-util) -(require 'ietf-drums) +;; Fixme: Avoid this (used for mail-parse-charset) mm dependence on gnus. (require 'mail-prsvr) +(require 'base64) +(autoload 'mm-body-7-or-8 "mm-bodies") (defvar rfc2047-header-encoding-alist '(("Newsgroups" . nil) + ("Followup-To" . nil) ("Message-ID" . nil) + ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\ +\\|Mail-Followup-To\\|Mail-Copies-To\\|Approved\\)" . address-mime) (t . mime)) "*Header/encoding method alist. The list is traversed sequentially. The keys can either be -header regexps or `t'. +header regexps or t. The values can be: 1) nil, in which case no encoding is done; 2) `mime', in which case the header will be encoded according to RFC2047; -3) a charset, in which case it will be encoded as that charset; -4) `default', in which case the field will be encoded as the rest +3) `address-mime', like `mime', but takes account of the rules for address + fields (where quoted strings and comments must be treated separately); +4) a charset, in which case it will be encoded as that charset; +5) `default', in which case the field will be encoded as the rest of the article.") (defvar rfc2047-charset-encoding-alist @@ -58,19 +67,25 @@ The values can be: (iso-8859-4 . Q) (iso-8859-5 . B) (koi8-r . B) - (iso-8859-7 . Q) - (iso-8859-8 . Q) + (iso-8859-7 . B) + (iso-8859-8 . B) (iso-8859-9 . Q) + (iso-8859-14 . Q) + (iso-8859-15 . Q) (iso-2022-jp . B) (iso-2022-kr . B) (gb2312 . B) + (big5 . B) + (cn-big5 . B) (cn-gb . B) (cn-gb-2312 . B) (euc-kr . B) (iso-2022-jp-2 . B) - (iso-2022-int-1 . B)) + (iso-2022-int-1 . B) + (viscii . Q)) "Alist of MIME charsets to RFC2047 encodings. -Valid encodings are nil, `Q' and `B'.") +Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding, +quoted-printable and base64 respectively.") (defvar rfc2047-encoding-function-alist '((Q . rfc2047-q-encode-region) @@ -78,15 +93,29 @@ Valid encodings are nil, `Q' and `B'.") (nil . ignore)) "Alist of RFC2047 encodings to encoding functions.") -(defvar rfc2047-q-encoding-alist - '(("\\(From\\|Cc\\|To\\|Bcc\||Reply-To\\):" . "-A-Za-z0-9!*+/=_") - ("." . "^\000-\007\011\013\015-\037\200-\377=_?")) - "Alist of header regexps and valid Q characters.") - ;;; ;;; Functions for encoding RFC2047 messages ;;; +(defun rfc2047-qp-or-base64 () + "Return the type with which to encode the buffer. +This is either `base64' or `quoted-printable'." + (save-excursion + (let ((limit (min (point-max) (+ 2000 (point-min)))) + (n8bit 0)) + (goto-char (point-min)) + (skip-chars-forward "\x20-\x7f\r\n\t" limit) + (while (< (point) limit) + (incf n8bit) + (forward-char 1) + (skip-chars-forward "\x20-\x7f\r\n\t" limit)) + (if (or (< (* 6 n8bit) (- limit (point-min))) + ;; Don't base64, say, a short line with a single + ;; non-ASCII char when splitting parts by charset. + (= n8bit 1)) + 'quoted-printable + 'base64)))) + (defun rfc2047-narrow-to-field () "Narrow the buffer to the header on the current line." (beginning-of-line) @@ -95,12 +124,24 @@ Valid encodings are nil, `Q' and `B'.") (progn (forward-line 1) (if (re-search-forward "^[^ \n\t]" nil t) - (progn - (beginning-of-line) - (point)) + (point-at-bol) (point-max)))) (goto-char (point-min))) +(defun rfc2047-field-value () + "Return the value of the field at point." + (save-excursion + (save-restriction + (rfc2047-narrow-to-field) + (re-search-forward ":[ \t\n]*" nil t) + (buffer-substring (point) (point-max))))) + +(defvar rfc2047-encoding-type 'address-mime + "The type of encoding done by `rfc2047-encode-region'. +This should be dynamically bound around calls to +`rfc2047-encode-region' to either `mime' or `address-mime'. See +`rfc2047-header-encoding-alist', for definitions.") + (defun rfc2047-encode-message-header () "Encode the message header according to `rfc2047-header-encoding-alist'. Should be called narrowed to the head of the message." @@ -112,15 +153,25 @@ Should be called narrowed to the head of the message." (save-restriction (rfc2047-narrow-to-field) (if (not (rfc2047-encodable-p)) - (if (and (eq (mm-body-7-or-8) '8bit) - (mm-multibyte-p) - (mm-coding-system-p - (car message-posting-charset))) - ;; 8 bit must be decoded. - ;; Is message-posting-charset a coding system? - (mm-encode-coding-region - (point-min) (point-max) - (car message-posting-charset))) + (prog1 + (if (and (eq (mm-body-7-or-8) '8bit) + (mm-multibyte-p) + (mm-coding-system-p + (car message-posting-charset))) + ;; 8 bit must be decoded. + (mm-encode-coding-region + (point-min) (point-max) + (mm-charset-to-coding-system + (car message-posting-charset)))) + ;; No encoding necessary, but folding is nice + (rfc2047-fold-region + (save-excursion + (goto-char (point-min)) + (skip-chars-forward "^:") + (when (looking-at ": ") + (forward-char 2)) + (point)) + (point-max))) ;; We found something that may perhaps be encoded. (setq method nil alist rfc2047-header-encoding-alist) @@ -130,179 +181,385 @@ Should be called narrowed to the head of the message." (eq (car elem) t)) (setq alist nil method (cdr elem)))) + (goto-char (point-min)) + (re-search-forward "^[^:]+: *" nil t) (cond + ((eq method 'address-mime) + (rfc2047-encode-region (point) (point-max))) ((eq method 'mime) - (rfc2047-encode-region (point-min) (point-max)) - (rfc2047-fold-region (point-min) (point-max))) + (let ((rfc2047-encoding-type 'mime)) + (rfc2047-encode-region (point) (point-max)))) ((eq method 'default) (if (and (featurep 'mule) + (if (boundp 'default-enable-multibyte-characters) + default-enable-multibyte-characters) mail-parse-charset) - (mm-encode-coding-region (point-min) (point-max) + (mm-encode-coding-region (point) (point-max) mail-parse-charset))) + ;; We get this when CC'ing messsages to newsgroups with + ;; 8-bit names. The group name mail copy just got + ;; unconditionally encoded. Previously, it would ask + ;; whether to encode, which was quite confusing for the + ;; user. If the new behaviour is wrong, tell me. I have + ;; left the old code commented out below. + ;; -- Per Abrahamsen Date: 2001-10-07. + ;; Modified by Dave Love, with the commented-out code changed + ;; in accordance with changes elsewhere. + ((null method) + (rfc2047-encode-region (point) (point-max))) +;;; ((null method) +;;; (if (or (message-options-get +;;; 'rfc2047-encode-message-header-encode-any) +;;; (message-options-set +;;; 'rfc2047-encode-message-header-encode-any +;;; (y-or-n-p +;;; "Some texts are not encoded. Encode anyway?"))) +;;; (rfc2047-encode-region (point-min) (point-max)) +;;; (error "Cannot send unencoded text"))) ((mm-coding-system-p method) - (if (featurep 'mule) - (mm-encode-coding-region (point-min) (point-max) method))) + (if (and (featurep 'mule) + (if (boundp 'default-enable-multibyte-characters) + default-enable-multibyte-characters)) + (mm-encode-coding-region (point) (point-max) method))) ;; Hm. (t))) (goto-char (point-max))))))) -(defun rfc2047-encodable-p (&optional header) - "Say whether the current (narrowed) buffer contains characters that need encoding in headers." +;; Fixme: This, and the require below may not be the Right Thing, but +;; should be safe just before release. -- fx 2001-02-08 +(eval-when-compile (defvar message-posting-charset)) + +(defun rfc2047-encodable-p () + "Return non-nil if any characters in current buffer need encoding in headers. +The buffer may be narrowed." + (require 'message) ; for message-posting-charset (let ((charsets - (mapcar - 'mm-mime-charset - (mm-find-charset-region (point-min) (point-max)))) - (cs (list 'us-ascii (car message-posting-charset))) - found) - (while charsets - (unless (memq (pop charsets) cs) - (setq found t))) - found)) - -(defun rfc2047-dissect-region (b e) - "Dissect the region between B and E into words." - (let ((all-specials (concat ietf-drums-tspecials " \t\n\r")) - (special-list (mapcar 'identity ietf-drums-tspecials)) - (blank-list '(? ?\t ?\n ?\r)) - words current cs state mail-parse-mule-charset) - (save-restriction - (narrow-to-region b e) - (goto-char (point-min)) - (skip-chars-forward all-specials) - (setq b (point)) - (while (not (eobp)) - (cond - ((not state) - (setq state 'word) - (if (not (eq (setq cs (mm-charset-after)) 'ascii)) - (setq current cs)) - (setq b (point))) - ((eq state 'blank) - (cond - ((memq (char-after) special-list) - (setq state nil)) - ((memq (char-after) blank-list)) - (t - (setq state 'word) - (unless b - (setq b (point))) - (if (not (eq (setq cs (mm-charset-after)) 'ascii)) - (setq current cs))))) - ((eq state 'word) - (cond - ((memq (char-after) special-list) - (setq state nil) - (push (list b (point) current) words) - (setq current nil)) - ((memq (char-after) blank-list) - (setq state 'blank) - (if (not current) - (setq b nil) - (push (list b (point) current) words) - (setq b (point)) - (setq current nil))) - ((or (eq (setq cs (mm-charset-after)) 'ascii) - (if current - (eq current cs) - (setq current cs)))) - (t - (push (list b (point) current) words) - (setq current cs) - (setq b (point)))))) - (if state - (forward-char) - (skip-chars-forward all-specials))) - (if (eq state 'word) - (push (list b (point) current) words))) - words)) + (mm-find-mime-charset-region (point-min) (point-max)))) + (and charsets + (not (equal charsets (list (car message-posting-charset))))))) + +;; Use this syntax table when parsing into regions that may need +;; encoding. Double quotes are string delimiters, backslash is +;; character quoting, and all other RFC 2822 special characters are +;; treated as punctuation so we can use forward-sexp/forward-word to +;; skip to the end of regions appropriately. Nb. ietf-drums does +;; things differently. +(defconst rfc2047-syntax-table + ;; (make-char-table 'syntax-table '(2)) only works in Emacs. + (let ((table (make-syntax-table))) + ;; The following is done to work for setting all elements of the table + ;; in Emacs 21 and 22 and XEmacs; it appears to be the cleanest way. + ;; Play safe and don't assume the form of the word syntax entry -- + ;; copy it from ?a. + (if (fboundp 'set-char-table-range) ; Emacs + (funcall (intern "set-char-table-range") + table t (aref (standard-syntax-table) ?a)) + (if (fboundp 'put-char-table) + (if (fboundp 'get-char-table) ; warning avoidance + (put-char-table t (get-char-table ?a (standard-syntax-table)) + table)))) + (modify-syntax-entry ?\\ "\\" table) + (modify-syntax-entry ?\" "\"" table) + (modify-syntax-entry ?\( "." table) + (modify-syntax-entry ?\) "." table) + (modify-syntax-entry ?\< "." table) + (modify-syntax-entry ?\> "." table) + (modify-syntax-entry ?\[ "." table) + (modify-syntax-entry ?\] "." table) + (modify-syntax-entry ?: "." table) + (modify-syntax-entry ?\; "." table) + (modify-syntax-entry ?, "." table) + (modify-syntax-entry ?@ "." table) + table)) (defun rfc2047-encode-region (b e) - "Encode all encodable words in REGION." - (let ((words (rfc2047-dissect-region b e)) - beg end current word) - (while (setq word (pop words)) - (if (equal (nth 2 word) current) - (setq beg (nth 0 word)) - (when current - (if (and (eq beg (nth 1 word)) (nth 2 word)) - (progn - ;; There might be a bug in Emacs Mule. - ;; A space must be inserted before encoding. - (goto-char beg) - (insert " ") - (rfc2047-encode (1+ beg) (1+ end) current)) - (rfc2047-encode beg end current))) - (setq current (nth 2 word) - beg (nth 0 word) - end (nth 1 word)))) - (when current - (rfc2047-encode beg end current)))) + "Encode words in region B to E that need encoding. +By default, the region is treated as containing RFC2822 addresses. +Dynamically bind `rfc2047-encoding-type' to change that." + (save-restriction + (narrow-to-region b e) + (if (eq 'mime rfc2047-encoding-type) + ;; Simple case. Treat as single word after any initial ASCII + ;; part and before any tailing ASCII part. The leading ASCII + ;; is relevant for instance in Subject headers with `Re:' for + ;; interoperability with non-MIME clients, and we might as + ;; well avoid the tail too. + (progn + (goto-char (point-min)) + ;; Does it need encoding? + (skip-chars-forward "\000-\177") + (unless (eobp) + (skip-chars-backward "^ \n") ; beginning of space-delimited word + (rfc2047-encode (point) (progn + (goto-char e) + (skip-chars-backward "\000-\177") + (skip-chars-forward "^ \n") + ;; end of space-delimited word + (point))))) + ;; `address-mime' case -- take care of quoted words, comments. + (with-syntax-table rfc2047-syntax-table + (let ((start) ; start of current token + end ; end of current token + ;; Whether there's an encoded word before the current + ;; token, either immediately or separated by space. + last-encoded) + (goto-char (point-min)) + (condition-case nil ; in case of unbalanced quotes + ;; Look for rfc2822-style: sequences of atoms, quoted + ;; strings, specials, whitespace. (Specials mustn't be + ;; encoded.) + (while (not (eobp)) + (setq start (point)) + ;; Skip whitespace. + (unless (= 0 (skip-chars-forward " \t\n")) + (setq start (point))) + (cond + ((not (char-after))) ; eob + ;; else token start + ((eq ?\" (char-syntax (char-after))) + ;; Quoted word. + (forward-sexp) + (setq end (point)) + ;; Does it need encoding? + (goto-char start) + (skip-chars-forward "\000-\177" end) + (if (= end (point)) + (setq last-encoded nil) + ;; It needs encoding. Strip the quotes first, + ;; since encoded words can't occur in quotes. + (goto-char end) + (delete-backward-char 1) + (goto-char start) + (delete-char 1) + (when last-encoded + ;; There was a preceding quoted word. We need + ;; to include any separating whitespace in this + ;; word to avoid it getting lost. + (skip-chars-backward " \t") + ;; A space is needed between the encoded words. + (insert ? ) + (setq start (point) + end (1+ end))) + ;; Adjust the end position for the deleted quotes. + (rfc2047-encode start (- end 2)) + (setq last-encoded t))) ; record that it was encoded + ((eq ?. (char-syntax (char-after))) + ;; Skip other delimiters, but record that they've + ;; potentially separated quoted words. + (forward-char) + (setq last-encoded nil)) + (t ; normal token/whitespace sequence + ;; Find the end. + (forward-word 1) + (skip-chars-backward " \t") + (setq end (point)) + ;; Deal with encoding and leading space as for + ;; quoted words. + (goto-char start) + (skip-chars-forward "\000-\177" end) + (if (= end (point)) + (setq last-encoded nil) + (when last-encoded + (goto-char start) + (skip-chars-backward " \t") + (insert ? ) + (setq start (point) + end (1+ end))) + (rfc2047-encode start end) + (setq last-encoded t))))) + (error + (error "Invalid data for rfc2047 encoding: %s" + (buffer-substring b e))))))) + (rfc2047-fold-region b (point)))) (defun rfc2047-encode-string (string) - "Encode words in STRING." + "Encode words in STRING. +By default, the string is treated as containing addresses (see +`rfc2047-encoding-type')." (with-temp-buffer (insert string) (rfc2047-encode-region (point-min) (point-max)) (buffer-string))) -(defun rfc2047-encode (b e charset) - "Encode the word in the region with CHARSET." - (let* ((mime-charset (mm-mime-charset charset)) +(defun rfc2047-encode (b e) + "Encode the word(s) in the region B to E. +By default, the region is treated as containing addresses (see +`rfc2047-encoding-type')." + (let* ((mime-charset (mm-find-mime-charset-region b e)) + (cs (if (> (length mime-charset) 1) + ;; Fixme: Instead of this, try to break region into + ;; parts that can be encoded separately. + (error "Can't rfc2047-encode `%s'" + (buffer-substring b e)) + (setq mime-charset (car mime-charset)) + (mm-charset-to-coding-system mime-charset))) + ;; Fixme: Better, calculate the number of non-ASCII + ;; characters, at least for 8-bit charsets. (encoding (or (cdr (assq mime-charset rfc2047-charset-encoding-alist)) - 'B)) + ;; For the charsets that don't have a preferred + ;; encoding, choose the one that's shorter. + (save-restriction + (narrow-to-region b e) + (if (eq (rfc2047-qp-or-base64) 'base64) + 'B + 'Q)))) (start (concat "=?" (downcase (symbol-name mime-charset)) "?" (downcase (symbol-name encoding)) "?")) + (factor (case mime-charset + ((iso-8859-5 iso-8859-7 iso-8859-8 koi8-r) 1) + ((big5 gb2312 euc-kr) 2) + (utf-8 4) + (t 8))) + (pre (- b (save-restriction + (widen) + (point-at-bol)))) + ;; encoded-words must not be longer than 75 characters, + ;; including charset, encoding etc. This leaves us with + ;; 75 - (length start) - 2 - 2 characters. The last 2 is for + ;; possible base64 padding. In the worst case (iso-2022-*) + ;; each character expands to 8 bytes which is expanded by a + ;; factor of 4/3 by base64 encoding. + (length (floor (- 75 (length start) 4) (* factor (/ 4.0 3.0)))) + ;; Limit line length to 76 characters. + (length1 (max 1 (floor (- 76 (length start) 4 pre) + (* factor (/ 4.0 3.0))))) (first t)) + (if mime-charset + (save-restriction + (narrow-to-region b e) + (when (eq encoding 'B) + ;; break into lines before encoding + (goto-char (point-min)) + (while (not (eobp)) + (if first + (progn + (goto-char (min (point-max) (+ length1 (point)))) + (setq first nil)) + (goto-char (min (point-max) (+ length (point))))) + (unless (eobp) + (insert ?\n))) + (setq first t)) + (if (and (mm-multibyte-p) + (mm-coding-system-p cs)) + (mm-encode-coding-region (point-min) (point-max) cs)) + (funcall (cdr (assq encoding rfc2047-encoding-function-alist)) + (point-min) (point-max)) + (goto-char (point-min)) + (while (not (eobp)) + (unless first + (insert ? )) + (setq first nil) + (insert start) + (end-of-line) + (insert "?=") + (forward-line 1)))))) + +(defun rfc2047-fold-field () + "Fold the current header field." + (save-excursion (save-restriction - (narrow-to-region b e) - (when (eq encoding 'B) - ;; break into lines before encoding - (goto-char (point-min)) - (while (not (eobp)) - (goto-char (min (point-max) (+ 15 (point)))) - (unless (eobp) - (insert "\n")))) - (if (and (mm-multibyte-p) - (mm-coding-system-p mime-charset)) - (mm-encode-coding-region (point-min) (point-max) mime-charset)) - (funcall (cdr (assq encoding rfc2047-encoding-function-alist)) - (point-min) (point-max)) - (goto-char (point-min)) - (while (not (eobp)) - (unless first - (insert " ")) - (setq first nil) - (insert start) - (end-of-line) - (insert "?=") - (forward-line 1))))) + (rfc2047-narrow-to-field) + (rfc2047-fold-region (point-min) (point-max))))) (defun rfc2047-fold-region (b e) - "Fold the long lines in the region." + "Fold long lines in region B to E." (save-restriction (narrow-to-region b e) (goto-char (point-min)) - (let ((break nil)) + (let ((break nil) + (qword-break nil) + (first t) + (bol (save-restriction + (widen) + (point-at-bol)))) (while (not (eobp)) + (when (and (or break qword-break) + (> (- (point) bol) 76)) + (goto-char (or break qword-break)) + (setq break nil + qword-break nil) + (if (looking-at "[ \t]") + (insert ?\n) + (insert "\n ")) + (setq bol (1- (point))) + ;; Don't break before the first non-LWSP characters. + (skip-chars-forward " \t") + (unless (eobp) + (forward-char 1))) (cond + ((eq (char-after) ?\n) + (forward-char 1) + (setq bol (point) + break nil + qword-break nil) + (skip-chars-forward " \t") + (unless (or (eobp) (eq (char-after) ?\n)) + (forward-char 1))) + ((eq (char-after) ?\r) + (forward-char 1)) ((memq (char-after) '(? ?\t)) - (setq break (point))) - ((and (not break) - (looking-at "=\\?")) - (setq break (point))) - ((and break - (looking-at "\\?=") - (> (- (point) (save-excursion (beginning-of-line) (point))) 76)) - (goto-char break) - (setq break nil) - (insert "\n "))) + (skip-chars-forward " \t") + (if first + ;; Don't break just after the header name. + (setq first nil) + (setq break (1- (point))))) + ((not break) + (if (not (looking-at "=\\?[^=]")) + (if (eq (char-after) ?=) + (forward-char 1) + (skip-chars-forward "^ \t\n\r=")) + ;; Don't break at the start of the field. + (unless (= (point) b) + (setq qword-break (point))) + (skip-chars-forward "^ \t\n\r"))) + (t + (skip-chars-forward "^ \t\n\r")))) + (when (and (or break qword-break) + (> (- (point) bol) 76)) + (goto-char (or break qword-break)) + (setq break nil + qword-break nil) + (if (looking-at "[ \t]") + (insert ?\n) + (insert "\n ")) + (setq bol (1- (point))) + ;; Don't break before the first non-LWSP characters. + (skip-chars-forward " \t") (unless (eobp) (forward-char 1)))))) +(defun rfc2047-unfold-field () + "Fold the current line." + (save-excursion + (save-restriction + (rfc2047-narrow-to-field) + (rfc2047-unfold-region (point-min) (point-max))))) + +(defun rfc2047-unfold-region (b e) + "Unfold lines in region B to E." + (save-restriction + (narrow-to-region b e) + (goto-char (point-min)) + (let ((bol (save-restriction + (widen) + (point-at-bol))) + (eol (point-at-eol))) + (forward-line 1) + (while (not (eobp)) + (if (and (looking-at "[ \t]") + (< (- (point-at-eol) bol) 76)) + (delete-region eol (progn + (goto-char eol) + (skip-chars-forward "\r\n") + (point))) + (setq bol (point-at-bol))) + (setq eol (point-at-eol)) + (forward-line 1))))) + (defun rfc2047-b-encode-region (b e) - "Encode the header contained in REGION with the B encoding." + "Base64-encode the header contained in region B to E." (save-restriction (narrow-to-region (goto-char b) e) (while (not (eobp)) @@ -312,30 +569,56 @@ Should be called narrowed to the head of the message." (forward-line)))) (defun rfc2047-q-encode-region (b e) - "Encode the header contained in REGION with the Q encoding." + "Quoted-printable-encode the header in region B to E." (save-excursion (save-restriction (narrow-to-region (goto-char b) e) - (let ((alist rfc2047-q-encoding-alist)) - (while alist - (when (looking-at (caar alist)) - (quoted-printable-encode-region b e nil (cdar alist)) - (subst-char-in-region (point-min) (point-max) ? ?_) - (setq alist nil)) - (pop alist)) - (goto-char (point-min)) - (while (not (eobp)) - (goto-char (min (point-max) (+ 64 (point)))) - (search-backward "=" (- (point) 2) t) - (unless (eobp) - (insert "\n"))))))) + (let ((bol (save-restriction + (widen) + (point-at-bol)))) + (quoted-printable-encode-region + b e nil + ;; = (\075), _ (\137), ? (\077) are used in the encoded word. + ;; Avoid using 8bit characters. + ;; This list excludes `especials' (see the RFC2047 syntax), + ;; meaning that some characters in non-structured fields will + ;; get encoded when they con't need to be. The following is + ;; what it used to be. +;;; ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?" +;;; "\010\012\014\040-\074\076\100-\136\140-\177") + "-\b\n\f !#-'*+0-9A-Z\\^`-~\d") + (subst-char-in-region (point-min) (point-max) ? ?_) + ;; The size of QP encapsulation is about 20, so set limit to + ;; 56=76-20. + (unless (< (- (point-max) (point-min)) 56) + ;; Don't break if it could fit in one line. + ;; Let rfc2047-encode-region break it later. + (goto-char (1+ (point-min))) + (while (and (not (bobp)) (not (eobp))) + (goto-char (min (point-max) (+ 56 bol))) + (search-backward "=" (- (point) 2) t) + (unless (or (bobp) (eobp)) + (insert ?\n) + (setq bol (point))))))))) ;;; ;;; Functions for decoding RFC2047 messages ;;; -(defvar rfc2047-encoded-word-regexp - "=\\?\\([^][\000-\040()<>@,\;:\\\"/?.=]+\\)\\?\\(B\\|Q\\)\\?\\([!->@-~ +]+\\)\\?=") +(eval-and-compile + (defconst rfc2047-encoded-word-regexp + "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)(?:\\*[^?]+\\)?\ +\\?\\(B\\|Q\\)\\?\\([!->@-~ +]*\\)\\?=")) + +;; Fixme: This should decode in place, not cons intermediate strings. +;; Also check whether it needs to worry about delimiting fields like +;; encoding. + +;; In fact it's reported that (invalid) encoding of mailboxes in +;; addr-specs is in use, so delimiting fields might help. Probably +;; not decoding a word which isn't properly delimited is good enough +;; and worthwhile (is it more correct or not?), e.g. something like +;; `=?iso-8859-1?q?foo?=@'. (defun rfc2047-decode-region (start end) "Decode MIME-encoded words in region between START and END." @@ -348,9 +631,10 @@ Should be called narrowed to the head of the message." (goto-char (point-min)) ;; Remove whitespace between encoded words. (while (re-search-forward - (concat "\\(" rfc2047-encoded-word-regexp "\\)" - "\\(\n?[ \t]\\)+" - "\\(" rfc2047-encoded-word-regexp "\\)") + (eval-when-compile + (concat "\\(" rfc2047-encoded-word-regexp "\\)" + "\\(\n?[ \t]\\)+" + "\\(" rfc2047-encoded-word-regexp "\\)")) nil t) (delete-region (goto-char (match-end 1)) (match-beginning 6))) ;; Decode the encoded words. @@ -361,8 +645,17 @@ Should be called narrowed to the head of the message." (prog1 (match-string 0) (delete-region (match-beginning 0) (match-end 0))))) + ;; Remove newlines between decoded words, though such things + ;; essentially must not be there. + (save-restriction + (narrow-to-region e (point)) + (goto-char e) + (while (re-search-forward "[\n\r]+" nil t) + (replace-match " ")) + (goto-char (point-max))) (when (and (mm-multibyte-p) mail-parse-charset + (not (eq mail-parse-charset 'us-ascii)) (not (eq mail-parse-charset 'gnus-decoded))) (mm-decode-coding-region b e mail-parse-charset)) (setq b (point))) @@ -375,41 +668,84 @@ Should be called narrowed to the head of the message." (defun rfc2047-decode-string (string) "Decode the quoted-printable-encoded STRING and return the results." (let ((m (mm-multibyte-p))) - (with-temp-buffer - (when m - (mm-enable-multibyte)) - (insert string) - (inline - (rfc2047-decode-region (point-min) (point-max))) - (buffer-string)))) + (if (string-match "=\\?" string) + (with-temp-buffer + ;; Fixme: This logic is wrong, but seems to be required by + ;; Gnus summary buffer generation. The value of `m' depends + ;; on the current buffer, not global multibyteness or that + ;; of the string. Also the string returned should always be + ;; multibyte in a multibyte session, i.e. the buffer should + ;; be multibyte before `buffer-string' is called. + (when m + (mm-enable-multibyte)) + (insert string) + (inline + (rfc2047-decode-region (point-min) (point-max))) + (buffer-string)) + ;; Fixme: As above, `m' here is inappropriate. + (if (and m + mail-parse-charset + (not (eq mail-parse-charset 'us-ascii)) + (not (eq mail-parse-charset 'gnus-decoded))) + ;; `decode-coding-string' in Emacs offers a third optional + ;; arg NOCOPY to avoid consing a new string if the decoding + ;; is "trivial". Unfortunately it currently doesn't + ;; consider anything else than a `nil' coding system + ;; trivial. + ;; `rfc2047-decode-string' is called multiple times for each + ;; article during summary buffer generation, and we really + ;; want to avoid unnecessary consing. So we bypass + ;; `decode-coding-string' if the string is purely ASCII. + (if (and (fboundp 'detect-coding-string) + ;; string is purely ASCII + (eq (detect-coding-string string t) 'undecided)) + string + (mm-decode-coding-string string mail-parse-charset)) + (mm-string-as-multibyte string))))) (defun rfc2047-parse-and-decode (word) "Decode WORD and return it if it is an encoded word. -Return WORD if not." +Return WORD if it is not not an encoded word or if the charset isn't +decodable." (if (not (string-match rfc2047-encoded-word-regexp word)) word (or (condition-case nil (rfc2047-decode (match-string 1 word) - (upcase (match-string 2 word)) + (string-to-char (match-string 2 word)) (match-string 3 word)) (error word)) - word))) + word))) ; un-decodable + +(defun rfc2047-pad-base64 (string) + "Pad STRING to quartets." + ;; Be more liberal to accept buggy base64 strings. If + ;; base64-decode-string accepts buggy strings, this function could + ;; be aliased to identity. + (if (= 0 (mod (length string) 4)) + string + (when (string-match "=+$" string) + (setq string (substring string 0 (match-beginning 0)))) + (case (mod (length string) 4) + (0 string) + (1 string) ;; Error, don't pad it. + (2 (concat string "==")) + (3 (concat string "="))))) (defun rfc2047-decode (charset encoding string) - "Decode STRING that uses CHARSET with ENCODING. -Valid ENCODINGs are \"B\" and \"Q\". -If your Emacs implementation can't decode CHARSET, it returns nil." + "Decode STRING from the given MIME CHARSET in the given ENCODING. +Valid ENCODINGs are the characters \"B\" and \"Q\". +If your Emacs implementation can't decode CHARSET, return nil." (if (stringp charset) (setq charset (intern (downcase charset)))) - (if (or (not charset) + (if (or (not charset) (eq 'gnus-all mail-parse-ignored-charsets) (memq 'gnus-all mail-parse-ignored-charsets) (memq charset mail-parse-ignored-charsets)) (setq charset mail-parse-charset)) (let ((cs (mm-charset-to-coding-system charset))) - (if (and (not cs) charset + (if (and (not cs) charset (listp mail-parse-ignored-charsets) (memq 'gnus-unknown mail-parse-ignored-charsets)) (setq cs (mm-charset-to-coding-system mail-parse-charset))) @@ -419,12 +755,13 @@ If your Emacs implementation can't decode CHARSET, it returns nil." (setq cs mail-parse-charset)) (mm-decode-coding-string (cond - ((equal "B" encoding) - (base64-decode-string string)) - ((equal "Q" encoding) + ((char-equal ?B encoding) + (base64-decode-string + (rfc2047-pad-base64 string))) + ((char-equal ?Q encoding) (quoted-printable-decode-string - (mm-replace-chars-in-string string ?_ ? ))) - (t (error "Invalid encoding: %s" encoding))) + (mm-subst-char-in-string ?_ ? string t))) + (t (error "Invalid encoding: %c" encoding))) cs)))) (provide 'rfc2047)