X-Git-Url: http://cgit.sxemacs.org/?p=gnus;a=blobdiff_plain;f=lisp%2Frfc2047.el;h=addca6354ef7bbf1bdca26f77173e45edf55f2c5;hp=924f7ab83a18ae8b8b211ea8558b9ae01eed5337;hb=646078e0f502997011151ce61733b4832f3b7d18;hpb=748d0c31bffe1bb503a9eb2b195119349f166c53 diff --git a/lisp/rfc2047.el b/lisp/rfc2047.el index 924f7ab83..addca6354 100644 --- a/lisp/rfc2047.el +++ b/lisp/rfc2047.el @@ -1,5 +1,7 @@ -;;; rfc2047.el --- Functions for encoding and decoding rfc2047 messages -;; Copyright (C) 1998, 1999, 2000, 2001, 2002 Free Software Foundation, Inc. +;;; rfc2047.el --- functions for encoding and decoding rfc2047 messages + +;; Copyright (C) 1998, 1999, 2000, 2002, 2003, 2004, 2005 +;; Free Software Foundation, Inc. ;; Author: Lars Magne Ingebrigtsen ;; MORIOKA Tomohiko @@ -34,18 +36,17 @@ (require 'qp) (require 'mm-util) (require 'ietf-drums) +;; Fixme: Avoid this (used for mail-parse-charset) mm dependence on gnus. (require 'mail-prsvr) (require 'base64) -;; Fixme: Avoid this (for gnus-point-at-...) mm dependence on gnus. -(require 'gnus-util) (autoload 'mm-body-7-or-8 "mm-bodies") (defvar rfc2047-header-encoding-alist '(("Newsgroups" . nil) ("Followup-To" . nil) ("Message-ID" . nil) - ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\)" . - "-A-Za-z0-9!*+/=_") + ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|\\(In-\\)?Reply-To\\|Sender\ +\\|Mail-Followup-To\\|Mail-Copies-To\\|Approved\\)" . address-mime) (t . mime)) "*Header/encoding method alist. The list is traversed sequentially. The keys can either be @@ -55,10 +56,11 @@ The values can be: 1) nil, in which case no encoding is done; 2) `mime', in which case the header will be encoded according to RFC2047; -3) a charset, in which case it will be encoded as that charset; -4) `default', in which case the field will be encoded as the rest - of the article. -5) a string, like `mime', expect for using it as word-chars.") +3) `address-mime', like `mime', but takes account of the rules for address + fields (where quoted strings and comments must be treated separately); +4) a charset, in which case it will be encoded as that charset; +5) `default', in which case the field will be encoded as the rest + of the article.") (defvar rfc2047-charset-encoding-alist '((us-ascii . nil) @@ -82,30 +84,44 @@ The values can be: (cn-gb-2312 . B) (euc-kr . B) (iso-2022-jp-2 . B) - (iso-2022-int-1 . B)) + (iso-2022-int-1 . B) + (viscii . Q)) "Alist of MIME charsets to RFC2047 encodings. Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding, quoted-printable and base64 respectively.") -(defvar rfc2047-encoding-function-alist - '((Q . rfc2047-q-encode-region) - (B . rfc2047-b-encode-region) - (nil . ignore)) +(defvar rfc2047-encode-function-alist + '((Q . rfc2047-q-encode-string) + (B . rfc2047-b-encode-string) + (nil . identity)) "Alist of RFC2047 encodings to encoding functions.") -(defvar rfc2047-q-encoding-alist - '(("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\):" - . "-A-Za-z0-9!*+/" ) - ;; = (\075), _ (\137), ? (\077) are used in the encoded word. - ;; Avoid using 8bit characters. - ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?" - ("." . "\010\012\014\040-\074\076\100-\136\140-\177")) - "Alist of header regexps and valid Q characters.") +(defvar rfc2047-encode-encoded-words t + "Whether encoded words should be encoded again.") ;;; ;;; Functions for encoding RFC2047 messages ;;; +(defun rfc2047-qp-or-base64 () + "Return the type with which to encode the buffer. +This is either `base64' or `quoted-printable'." + (save-excursion + (let ((limit (min (point-max) (+ 2000 (point-min)))) + (n8bit 0)) + (goto-char (point-min)) + (skip-chars-forward "\x20-\x7f\r\n\t" limit) + (while (< (point) limit) + (incf n8bit) + (forward-char 1) + (skip-chars-forward "\x20-\x7f\r\n\t" limit)) + (if (or (< (* 6 n8bit) (- limit (point-min))) + ;; Don't base64, say, a short line with a single + ;; non-ASCII char when splitting parts by charset. + (= n8bit 1)) + 'quoted-printable + 'base64)))) + (defun rfc2047-narrow-to-field () "Narrow the buffer to the header on the current line." (beginning-of-line) @@ -114,9 +130,7 @@ quoted-printable and base64 respectively.") (progn (forward-line 1) (if (re-search-forward "^[^ \n\t]" nil t) - (progn - (beginning-of-line) - (point)) + (point-at-bol) (point-max)))) (goto-char (point-min))) @@ -126,7 +140,13 @@ quoted-printable and base64 respectively.") (save-restriction (rfc2047-narrow-to-field) (re-search-forward ":[ \t\n]*" nil t) - (buffer-substring (point) (point-max))))) + (buffer-substring-no-properties (point) (point-max))))) + +(defvar rfc2047-encoding-type 'address-mime + "The type of encoding done by `rfc2047-encode-region'. +This should be dynamically bound around calls to +`rfc2047-encode-region' to either `mime' or `address-mime'. See +`rfc2047-header-encoding-alist', for definitions.") (defun rfc2047-encode-message-header () "Encode the message header according to `rfc2047-header-encoding-alist'. @@ -140,25 +160,25 @@ Should be called narrowed to the head of the message." (rfc2047-narrow-to-field) (if (not (rfc2047-encodable-p)) (prog1 - (if (and (eq (mm-body-7-or-8) '8bit) - (mm-multibyte-p) - (mm-coding-system-p - (car message-posting-charset))) - ;; 8 bit must be decoded. - ;; Is message-posting-charset a coding system? - (mm-encode-coding-region - (point-min) (point-max) - (car message-posting-charset)) - nil) + (if (and (eq (mm-body-7-or-8) '8bit) + (mm-multibyte-p) + (mm-coding-system-p + (car message-posting-charset))) + ;; 8 bit must be decoded. + (mm-encode-coding-region + (point-min) (point-max) + (mm-charset-to-coding-system + (car message-posting-charset)))) ;; No encoding necessary, but folding is nice - (rfc2047-fold-region - (save-excursion - (goto-char (point-min)) - (skip-chars-forward "^:") - (when (looking-at ": ") - (forward-char 2)) - (point)) - (point-max))) + (when nil + (rfc2047-fold-region + (save-excursion + (goto-char (point-min)) + (skip-chars-forward "^:") + (when (looking-at ": ") + (forward-char 2)) + (point)) + (point-max)))) ;; We found something that may perhaps be encoded. (setq method nil alist rfc2047-header-encoding-alist) @@ -168,46 +188,45 @@ Should be called narrowed to the head of the message." (eq (car elem) t)) (setq alist nil method (cdr elem)))) + (re-search-forward "^[^:]+: *" nil t) (cond - ((stringp method) - (rfc2047-encode-region (point-min) (point-max) method)) + ((eq method 'address-mime) + (rfc2047-encode-region (point) (point-max))) ((eq method 'mime) - (rfc2047-encode-region (point-min) (point-max))) + (let ((rfc2047-encoding-type 'mime)) + (rfc2047-encode-region (point) (point-max)))) ((eq method 'default) (if (and (featurep 'mule) (if (boundp 'default-enable-multibyte-characters) default-enable-multibyte-characters) mail-parse-charset) - (mm-encode-coding-region (point-min) (point-max) + (mm-encode-coding-region (point) (point-max) mail-parse-charset))) ;; We get this when CC'ing messsages to newsgroups with - ;; 8-bit names. The group name mail copy just get + ;; 8-bit names. The group name mail copy just got ;; unconditionally encoded. Previously, it would ask ;; whether to encode, which was quite confusing for the ;; user. If the new behaviour is wrong, tell me. I have ;; left the old code commented out below. ;; -- Per Abrahamsen Date: 2001-10-07. + ;; Modified by Dave Love, with the commented-out code changed + ;; in accordance with changes elsewhere. ((null method) - (when (delq 'ascii - (mm-find-charset-region (point-min) (point-max))) - (rfc2047-encode-region (point-min) (point-max)))) -;;; ((null method) -;;; (and (delq 'ascii -;;; (mm-find-charset-region (point-min) -;;; (point-max))) -;;; (if (or (message-options-get -;;; 'rfc2047-encode-message-header-encode-any) -;;; (message-options-set -;;; 'rfc2047-encode-message-header-encode-any -;;; (y-or-n-p -;;; "Some texts are not encoded. Encode anyway?"))) -;;; (rfc2047-encode-region (point-min) (point-max)) -;;; (error "Cannot send unencoded text")))) + (rfc2047-encode-region (point) (point-max))) +;;; ((null method) +;;; (if (or (message-options-get +;;; 'rfc2047-encode-message-header-encode-any) +;;; (message-options-set +;;; 'rfc2047-encode-message-header-encode-any +;;; (y-or-n-p +;;; "Some texts are not encoded. Encode anyway?"))) +;;; (rfc2047-encode-region (point-min) (point-max)) +;;; (error "Cannot send unencoded text"))) ((mm-coding-system-p method) (if (and (featurep 'mule) (if (boundp 'default-enable-multibyte-characters) default-enable-multibyte-characters)) - (mm-encode-coding-region (point-min) (point-max) method))) + (mm-encode-coding-region (point) (point-max) method))) ;; Hm. (t))) (goto-char (point-max))))))) @@ -221,137 +240,393 @@ Should be called narrowed to the head of the message." The buffer may be narrowed." (require 'message) ; for message-posting-charset (let ((charsets - (mapcar - 'mm-mime-charset - (mm-find-charset-region (point-min) (point-max)))) - (cs (list 'us-ascii (car message-posting-charset))) - found) - (while charsets - (unless (memq (pop charsets) cs) - (setq found t))) - found)) - -(defun rfc2047-dissect-region (b e &optional word-chars) - "Dissect the region between B and E into words." - (unless word-chars - ;; Anything except most CTLs, WSP - (setq word-chars "\010\012\014\041-\177")) - (let (mail-parse-mule-charset - words point current - result word) - (save-restriction - (narrow-to-region b e) - (goto-char (point-min)) - (skip-chars-forward "\000-\177") - (while (not (eobp)) - (setq point (point)) - (skip-chars-backward word-chars b) - (unless (eq b (point)) - (push (cons (buffer-substring b (point)) nil) words)) - (setq b (point)) - (goto-char point) - (setq current (mm-charset-after)) - (forward-char 1) - (skip-chars-forward word-chars) - (while (and (not (eobp)) - (eq (mm-charset-after) current)) - (forward-char 1) - (skip-chars-forward word-chars)) - (unless (eq b (point)) - (push (cons (buffer-substring b (point)) current) words)) - (setq b (point)) - (skip-chars-forward "\000-\177")) - (unless (eq b (point)) - (push (cons (buffer-substring b (point)) nil) words))) - ;; merge adjacent words - (setq word (pop words)) - (while word - (if (and (cdr word) - (caar words) - (not (cdar words)) - (not (string-match "[^ \t]" (caar words)))) - (if (eq (cdr (nth 1 words)) (cdr word)) - (progn - (setq word (cons (concat - (car (nth 1 words)) (caar words) - (car word)) - (cdr word))) - (pop words) - (pop words)) - (push (cons (concat (caar words) (car word)) (cdr word)) - result) - (pop words) - (setq word (pop words))) - (push word result) - (setq word (pop words)))) - result)) - -(defun rfc2047-encode-region (b e &optional word-chars) - "Encode all encodable words in region B to E." - (let ((words (rfc2047-dissect-region b e word-chars)) word) - (save-restriction - (narrow-to-region b e) - (delete-region (point-min) (point-max)) - (while (setq word (pop words)) - (if (not (cdr word)) - (insert (car word)) - (rfc2047-fold-region (gnus-point-at-bol) (point)) - (goto-char (point-max)) - (if (> (- (point) (save-restriction - (widen) - (gnus-point-at-bol))) 76) - (insert "\n ")) - ;; Insert blank between encoded words - (if (eq (char-before) ?=) (insert " ")) - (rfc2047-encode (point) - (progn (insert (car word)) (point)) - (cdr word)))) - (rfc2047-fold-region (point-min) (point-max))))) - -(defun rfc2047-encode-string (string &optional word-chars) - "Encode words in STRING." - (with-temp-buffer + (mm-find-mime-charset-region (point-min) (point-max)))) + (goto-char (point-min)) + (or (and rfc2047-encode-encoded-words + (prog1 + (search-forward "=?" nil t) + (goto-char (point-min)))) + (and charsets + (not (equal charsets (list (car message-posting-charset)))))))) + +;; Use this syntax table when parsing into regions that may need +;; encoding. Double quotes are string delimiters, backslash is +;; character quoting, and all other RFC 2822 special characters are +;; treated as punctuation so we can use forward-sexp/forward-word to +;; skip to the end of regions appropriately. Nb. ietf-drums does +;; things differently. +(defconst rfc2047-syntax-table + ;; (make-char-table 'syntax-table '(2)) only works in Emacs. + (let ((table (make-syntax-table))) + ;; The following is done to work for setting all elements of the table + ;; in Emacs 21 and 22 and XEmacs; it appears to be the cleanest way. + ;; Play safe and don't assume the form of the word syntax entry -- + ;; copy it from ?a. + (if (fboundp 'set-char-table-range) ; Emacs + (funcall (intern "set-char-table-range") + table t (aref (standard-syntax-table) ?a)) + (if (fboundp 'put-char-table) + (if (fboundp 'get-char-table) ; warning avoidance + (put-char-table t (get-char-table ?a (standard-syntax-table)) + table)))) + (modify-syntax-entry ?\\ "\\" table) + (modify-syntax-entry ?\" "\"" table) + (modify-syntax-entry ?\( "(" table) + (modify-syntax-entry ?\) ")" table) + (modify-syntax-entry ?\< "." table) + (modify-syntax-entry ?\> "." table) + (modify-syntax-entry ?\[ "." table) + (modify-syntax-entry ?\] "." table) + (modify-syntax-entry ?: "." table) + (modify-syntax-entry ?\; "." table) + (modify-syntax-entry ?, "." table) + (modify-syntax-entry ?@ "." table) + table)) + +(defun rfc2047-encode-region (b e) + "Encode words in region B to E that need encoding. +By default, the region is treated as containing RFC2822 addresses. +Dynamically bind `rfc2047-encoding-type' to change that." + (save-restriction + (narrow-to-region b e) + (let ((encodable-regexp (if rfc2047-encode-encoded-words + "[^\000-\177]+\\|=\\?" + "[^\000-\177]+")) + start ; start of current token + end begin csyntax + ;; Whether there's an encoded word before the current token, + ;; either immediately or separated by space. + last-encoded + (orig-text (buffer-substring-no-properties b e))) + (if (eq 'mime rfc2047-encoding-type) + ;; Simple case. Continuous words in which all those contain + ;; non-ASCII characters are encoded collectively. Encoding + ;; ASCII words, including `Re:' used in Subject headers, is + ;; avoided for interoperability with non-MIME clients and + ;; for making it easy to find keywords. + (progn + (goto-char (point-min)) + (while (progn (skip-chars-forward " \t\n") + (not (eobp))) + (setq start (point)) + (while (and (looking-at "[ \t\n]*\\([^ \t\n]+\\)") + (progn + (setq end (match-end 0)) + (re-search-forward encodable-regexp end t))) + (goto-char end)) + (if (> (point) start) + (rfc2047-encode start (point)) + (goto-char end)))) + ;; `address-mime' case -- take care of quoted words, comments. + (with-syntax-table rfc2047-syntax-table + (goto-char (point-min)) + (condition-case err ; in case of unbalanced quotes + ;; Look for rfc2822-style: sequences of atoms, quoted + ;; strings, specials, whitespace. (Specials mustn't be + ;; encoded.) + (while (not (eobp)) + ;; Skip whitespace. + (skip-chars-forward " \t\n") + (setq start (point)) + (cond + ((not (char-after))) ; eob + ;; else token start + ((eq ?\" (setq csyntax (char-syntax (char-after)))) + ;; Quoted word. + (forward-sexp) + (setq end (point)) + ;; Does it need encoding? + (goto-char start) + (if (re-search-forward encodable-regexp end 'move) + ;; It needs encoding. Strip the quotes first, + ;; since encoded words can't occur in quotes. + (progn + (goto-char end) + (delete-backward-char 1) + (goto-char start) + (delete-char 1) + (when last-encoded + ;; There was a preceding quoted word. We need + ;; to include any separating whitespace in this + ;; word to avoid it getting lost. + (skip-chars-backward " \t") + ;; A space is needed between the encoded words. + (insert ? ) + (setq start (point) + end (1+ end))) + ;; Adjust the end position for the deleted quotes. + (rfc2047-encode start (- end 2)) + (setq last-encoded t)) ; record that it was encoded + (setq last-encoded nil))) + ((eq ?. csyntax) + ;; Skip other delimiters, but record that they've + ;; potentially separated quoted words. + (forward-char) + (setq last-encoded nil)) + ((eq ?\) csyntax) + (error "Unbalanced parentheses")) + ((eq ?\( csyntax) + ;; Look for the end of parentheses. + (forward-list) + ;; Encode text as an unstructured field. + (let ((rfc2047-encoding-type 'mime)) + (rfc2047-encode-region (1+ start) (1- (point)))) + (skip-chars-forward ")")) + (t ; normal token/whitespace sequence + ;; Find the end. + ;; Skip one ASCII word, or encode continuous words + ;; in which all those contain non-ASCII characters. + (setq end nil) + (while (not (or end (eobp))) + (when (looking-at "[\000-\177]+") + (setq begin (point) + end (match-end 0)) + (when (progn + (while (and (or (re-search-forward + "[ \t\n]\\|\\Sw" end 'move) + (setq end nil)) + (eq ?\\ (char-syntax (char-before)))) + ;; Skip backslash-quoted characters. + (forward-char)) + end) + (setq end (match-beginning 0)) + (if rfc2047-encode-encoded-words + (progn + (goto-char begin) + (when (search-forward "=?" end 'move) + (goto-char (match-beginning 0)) + (setq end nil))) + (goto-char end)))) + ;; Where the value nil of `end' means there may be + ;; text to have to be encoded following the point. + ;; Otherwise, the point reached to the end of ASCII + ;; words separated by whitespace or a special char. + (unless end + (when (looking-at encodable-regexp) + (goto-char (setq begin (match-end 0))) + (while (and (looking-at "[ \t\n]+\\([^ \t\n]+\\)") + (setq end (match-end 0)) + (progn + (while (re-search-forward + encodable-regexp end t)) + (< begin (point))) + (goto-char begin) + (or (not (re-search-forward "\\Sw" end t)) + (progn + (goto-char (match-beginning 0)) + nil))) + (goto-char end)) + (when (looking-at "[^ \t\n]+") + (setq end (match-end 0)) + (if (re-search-forward "\\Sw+" end t) + ;; There are special characters better + ;; to be encoded so that MTAs may parse + ;; them safely. + (cond ((= end (point))) + ((looking-at (concat "\\sw*\\(" + encodable-regexp + "\\)")) + (setq end nil)) + (t + (goto-char (1- (match-end 0))) + (unless (= (point) (match-beginning 0)) + ;; Separate encodable text and + ;; delimiter. + (insert " ")))) + (goto-char end) + (skip-chars-forward " \t\n") + (if (and (looking-at "[^ \t\n]+") + (string-match encodable-regexp + (match-string 0))) + (setq end nil) + (goto-char end))))))) + (skip-chars-backward " \t\n") + (setq end (point)) + (goto-char start) + (if (re-search-forward encodable-regexp end 'move) + (progn + (unless (memq (char-before start) '(nil ?\t ? )) + (if (progn + (goto-char start) + (skip-chars-backward "^ \t\n") + (and (looking-at "\\Sw+") + (= (match-end 0) start))) + ;; Also encode bogus delimiters. + (setq start (point)) + ;; Separate encodable text and delimiter. + (goto-char start) + (insert " ") + (setq start (1+ start) + end (1+ end)))) + (rfc2047-encode start end) + (setq last-encoded t)) + (setq last-encoded nil))))) + (error + (if (or debug-on-quit debug-on-error) + (signal (car err) (cdr err)) + (error "Invalid data for rfc2047 encoding: %s" + (mm-replace-in-string orig-text "[ \t\n]+" " ")))))))) + (rfc2047-fold-region b (point)) + (goto-char (point-max)))) + +(defun rfc2047-encode-string (string) + "Encode words in STRING. +By default, the string is treated as containing addresses (see +`rfc2047-encoding-type')." + (mm-with-multibyte-buffer (insert string) - (rfc2047-encode-region (point-min) (point-max) word-chars) + (rfc2047-encode-region (point-min) (point-max)) (buffer-string))) -(defun rfc2047-encode (b e charset) - "Encode the word in the region B to E with CHARSET." - (let* ((mime-charset (mm-mime-charset charset)) - (cs (mm-charset-to-coding-system mime-charset)) - (encoding (or (cdr (assq mime-charset +(defvar rfc2047-encode-max-chars 76 + "Maximum characters of each header line that contain encoded-words. +If it is nil, encoded-words will not be folded. Too small value may +cause an error. Don't change this for no particular reason.") + +(defun rfc2047-encode-1 (column string cs encoder start crest tail + &optional eword) + "Subroutine used by `rfc2047-encode'." + (cond ((string-equal string "") + (or eword "")) + ((not rfc2047-encode-max-chars) + (concat start + (funcall encoder (if cs + (mm-encode-coding-string string cs) + string)) + "?=")) + ((>= column rfc2047-encode-max-chars) + (when eword + (cond ((string-match "\n[ \t]+\\'" eword) + ;; Reomove a superfluous empty line. + (setq eword (substring eword 0 (match-beginning 0)))) + ((string-match "(+\\'" eword) + ;; Break the line before the open parenthesis. + (setq crest (concat crest (match-string 0 eword)) + eword (substring eword 0 (match-beginning 0)))))) + (rfc2047-encode-1 (length crest) string cs encoder start " " tail + (concat eword "\n" crest))) + (t + (let ((index 0) + (limit (1- (length string))) + (prev "") + next len) + (while (and prev + (<= index limit)) + (setq next (concat start + (funcall encoder + (if cs + (mm-encode-coding-string + (substring string 0 (1+ index)) + cs) + (substring string 0 (1+ index)))) + "?=") + len (+ column (length next))) + (if (> len rfc2047-encode-max-chars) + (setq next prev + prev nil) + (if (or (< index limit) + (<= (+ len (or (string-match "\n" tail) + (length tail))) + rfc2047-encode-max-chars)) + (setq prev next + index (1+ index)) + (if (string-match "\\`)+" tail) + ;; Break the line after the close parenthesis. + (setq tail (concat (substring tail 0 (match-end 0)) + "\n " + (substring tail (match-end 0))) + prev next + index (1+ index)) + (setq next prev + prev nil))))) + (if (> index limit) + (concat eword next tail) + (if (= 0 index) + (if (and eword + (string-match "(+\\'" eword)) + (setq crest (concat crest (match-string 0 eword)) + eword (substring eword 0 (match-beginning 0))) + (setq eword (concat eword next))) + (setq crest " " + eword (concat eword next))) + (when (string-match "\n[ \t]+\\'" eword) + ;; Reomove a superfluous empty line. + (setq eword (substring eword 0 (match-beginning 0)))) + (rfc2047-encode-1 (length crest) (substring string index) + cs encoder start " " tail + (concat eword "\n" crest))))))) + +(defun rfc2047-encode (b e) + "Encode the word(s) in the region B to E. +Point moves to the end of the region." + (let ((mime-charset (or (mm-find-mime-charset-region b e) (list 'us-ascii))) + cs encoding tail crest eword) + (cond ((> (length mime-charset) 1) + (error "Can't rfc2047-encode `%s'" + (buffer-substring-no-properties b e))) + ((= (length mime-charset) 1) + (setq mime-charset (car mime-charset) + cs (mm-charset-to-coding-system mime-charset)) + (unless (and (mm-multibyte-p) + (mm-coding-system-p cs)) + (setq cs nil)) + (save-restriction + (narrow-to-region b e) + (setq encoding + (or (cdr (assq mime-charset rfc2047-charset-encoding-alist)) - 'B)) - (start (concat - "=?" (downcase (symbol-name mime-charset)) "?" - (downcase (symbol-name encoding)) "?")) - (first t)) - (save-restriction - (narrow-to-region b e) - (when (eq encoding 'B) - ;; break into lines before encoding - (goto-char (point-min)) - (while (not (eobp)) - (goto-char (min (point-max) (+ 15 (point)))) - (unless (eobp) - (insert "\n")))) - (if (and (mm-multibyte-p) - (mm-coding-system-p cs)) - (mm-encode-coding-region (point-min) (point-max) cs)) - (funcall (cdr (assq encoding rfc2047-encoding-function-alist)) - (point-min) (point-max)) - (goto-char (point-min)) - (while (not (eobp)) - (unless first - (insert " ")) - (setq first nil) - (insert start) - (end-of-line) - (insert "?=") - (forward-line 1))))) + ;; For the charsets that don't have a preferred + ;; encoding, choose the one that's shorter. + (if (eq (rfc2047-qp-or-base64) 'base64) + 'B + 'Q))) + (widen) + (goto-char e) + (skip-chars-forward "^ \t\n") + ;; `tail' may contain a close parenthesis. + (setq tail (buffer-substring-no-properties e (point))) + (goto-char b) + (setq b (point-marker) + e (set-marker (make-marker) e)) + (rfc2047-fold-region (point-at-bol) b) + (goto-char b) + (skip-chars-backward "^ \t\n") + (unless (= 0 (skip-chars-backward " \t")) + ;; `crest' may contain whitespace and an open parenthesis. + (setq crest (buffer-substring-no-properties (point) b))) + (setq eword (rfc2047-encode-1 + (- b (point-at-bol)) + (mm-replace-in-string + (buffer-substring-no-properties b e) + "\n\\([ \t]?\\)" "\\1") + cs + (or (cdr (assq encoding + rfc2047-encode-function-alist)) + 'identity) + (concat "=?" (downcase (symbol-name mime-charset)) + "?" (upcase (symbol-name encoding)) "?") + (or crest " ") + tail)) + (delete-region (if (eq (aref eword 0) ?\n) + (if (bolp) + ;; The line was folded before encoding. + (1- (point)) + (point)) + (goto-char b)) + (+ e (length tail))) + ;; `eword' contains `crest' and `tail'. + (insert eword) + (set-marker b nil) + (set-marker e nil) + (unless (or (/= 0 (length tail)) + (eobp) + (looking-at "[ \t\n)]")) + (insert " ")))) + (t + (goto-char e))))) (defun rfc2047-fold-field () - "Fold the current line." + "Fold the current header field." (save-excursion (save-restriction (rfc2047-narrow-to-field) @@ -367,15 +642,16 @@ The buffer may be narrowed." (first t) (bol (save-restriction (widen) - (gnus-point-at-bol)))) + (point-at-bol)))) (while (not (eobp)) (when (and (or break qword-break) (> (- (point) bol) 76)) (goto-char (or break qword-break)) (setq break nil qword-break nil) + (skip-chars-backward " \t") (if (looking-at "[ \t]") - (insert "\n") + (insert ?\n) (insert "\n ")) (setq bol (1- (point))) ;; Don't break before the first non-LWSP characters. @@ -395,27 +671,29 @@ The buffer may be narrowed." (forward-char 1)) ((memq (char-after) '(? ?\t)) (skip-chars-forward " \t") - (if first - ;; Don't break just after the header name. - (setq first nil) - (setq break (1- (point))))) + (unless first ;; Don't break just after the header name. + (setq break (point)))) ((not break) (if (not (looking-at "=\\?[^=]")) (if (eq (char-after) ?=) (forward-char 1) (skip-chars-forward "^ \t\n\r=")) - (setq qword-break (point)) + ;; Don't break at the start of the field. + (unless (= (point) b) + (setq qword-break (point))) (skip-chars-forward "^ \t\n\r"))) (t - (skip-chars-forward "^ \t\n\r")))) + (skip-chars-forward "^ \t\n\r"))) + (setq first nil)) (when (and (or break qword-break) (> (- (point) bol) 76)) (goto-char (or break qword-break)) (setq break nil qword-break nil) - (if (looking-at "[ \t]") - (insert "\n") - (insert "\n ")) + (if (or (> 0 (skip-chars-backward " \t")) + (looking-at "[ \t]")) + (insert ?\n) + (insert "\n ")) (setq bol (1- (point))) ;; Don't break before the first non-LWSP characters. (skip-chars-forward " \t") @@ -436,66 +714,87 @@ The buffer may be narrowed." (goto-char (point-min)) (let ((bol (save-restriction (widen) - (gnus-point-at-bol))) - (eol (gnus-point-at-eol))) + (point-at-bol))) + (eol (point-at-eol))) (forward-line 1) (while (not (eobp)) (if (and (looking-at "[ \t]") - (< (- (gnus-point-at-eol) bol) 76)) + (< (- (point-at-eol) bol) 76)) (delete-region eol (progn (goto-char eol) (skip-chars-forward "\r\n") (point))) - (setq bol (gnus-point-at-bol))) - (setq eol (gnus-point-at-eol)) + (setq bol (point-at-bol))) + (setq eol (point-at-eol)) (forward-line 1))))) -(defun rfc2047-b-encode-region (b e) - "Base64-encode the header contained in region B to E." - (save-restriction - (narrow-to-region (goto-char b) e) - (while (not (eobp)) - (base64-encode-region (point) (progn (end-of-line) (point)) t) - (if (and (bolp) (eolp)) - (delete-backward-char 1)) - (forward-line)))) - -(defun rfc2047-q-encode-region (b e) - "Quoted-printable-encode the header in region B to E." - (save-excursion - (save-restriction - (narrow-to-region (goto-char b) e) - (let ((alist rfc2047-q-encoding-alist) - (bol (save-restriction - (widen) - (gnus-point-at-bol)))) - (while alist - (when (looking-at (caar alist)) - (mm-with-unibyte-current-buffer-mule4 - (quoted-printable-encode-region - (point-min) (point-max) nil (cdar alist))) - (subst-char-in-region (point-min) (point-max) ? ?_) - (setq alist nil)) - (pop alist)) - ;; The size of QP encapsulation is about 20, so set limit to - ;; 56=76-20. - (unless (< (- (point-max) (point-min)) 56) - ;; Don't break if it could fit in one line. - ;; Let rfc2047-encode-region break it later. - (goto-char (1+ (point-min))) - (while (and (not (bobp)) (not (eobp))) - (goto-char (min (point-max) (+ 56 bol))) - (search-backward "=" (- (point) 2) t) - (unless (or (bobp) (eobp)) - (insert "\n") - (setq bol (point))))))))) +(defun rfc2047-b-encode-string (string) + "Base64-encode the header contained in STRING." + (base64-encode-string string t)) + +(defun rfc2047-q-encode-string (string) + "Quoted-printable-encode the header in STRING." + (mm-with-unibyte-buffer + (insert string) + (quoted-printable-encode-region + (point-min) (point-max) nil + ;; = (\075), _ (\137), ? (\077) are used in the encoded word. + ;; Avoid using 8bit characters. + ;; This list excludes `especials' (see the RFC2047 syntax), + ;; meaning that some characters in non-structured fields will + ;; get encoded when they con't need to be. The following is + ;; what it used to be. + ;;; ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?" + ;;; "\010\012\014\040-\074\076\100-\136\140-\177") + "-\b\n\f !#-'*+0-9A-Z\\^`-~\d") + (subst-char-in-region (point-min) (point-max) ? ?_) + (buffer-string))) + +(defun rfc2047-encode-parameter (param value) + "Return and PARAM=VALUE string encoded in the RFC2047-like style. +This is a replacement for the `rfc2231-encode-string' function. + +When attaching files as MIME parts, we should use the RFC2231 encoding +to specify the file names containing non-ASCII characters. However, +many mail softwares don't support it in practice and recipients won't +be able to extract files with correct names. Instead, the RFC2047-like +encoding is acceptable generally. This function provides the very +RFC2047-like encoding, resigning to such a regrettable trend. To use +it, put the following line in your ~/.gnus.el file: + +\(defalias 'mail-header-encode-parameter 'rfc2047-encode-parameter) +" + (let* ((rfc2047-encoding-type 'mime) + (rfc2047-encode-max-chars nil) + (string (rfc2047-encode-string value))) + (if (string-match (concat "[" ietf-drums-tspecials "]") string) + (format "%s=%S" param string) + (concat param "=" string)))) ;;; ;;; Functions for decoding RFC2047 messages ;;; -(defvar rfc2047-encoded-word-regexp - "=\\?\\([^][\000-\040()<>@,\;:\\\"/?.=]+\\)\\?\\(B\\|Q\\)\\?\\([!->@-~ +]*\\)\\?=") +(eval-and-compile + (defconst rfc2047-encoded-word-regexp + "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)\\(?:\\*[^?]+\\)?\ +\\?\\(B\\|Q\\)\\?\\([!->@-~ ]*\\)\\?=")) + +(defvar rfc2047-quote-decoded-words-containing-tspecials nil + "If non-nil, quote decoded words containing special characters.") + +(defvar rfc2047-quote-decoded-words-containing-tspecials nil + "If non-nil, quote decoded words containing special characters.") + +;; Fixme: This should decode in place, not cons intermediate strings. +;; Also check whether it needs to worry about delimiting fields like +;; encoding. + +;; In fact it's reported that (invalid) encoding of mailboxes in +;; addr-specs is in use, so delimiting fields might help. Probably +;; not decoding a word which isn't properly delimited is good enough +;; and worthwhile (is it more correct or not?), e.g. something like +;; `=?iso-8859-1?q?foo?=@'. (defun rfc2047-decode-region (start end) "Decode MIME-encoded words in region between START and END." @@ -508,9 +807,10 @@ The buffer may be narrowed." (goto-char (point-min)) ;; Remove whitespace between encoded words. (while (re-search-forward - (concat "\\(" rfc2047-encoded-word-regexp "\\)" - "\\(\n?[ \t]\\)+" - "\\(" rfc2047-encoded-word-regexp "\\)") + (eval-when-compile + (concat "\\(" rfc2047-encoded-word-regexp "\\)" + "\\(\n?[ \t]\\)+" + "\\(" rfc2047-encoded-word-regexp "\\)")) nil t) (delete-region (goto-char (match-end 1)) (match-beginning 6))) ;; Decode the encoded words. @@ -520,14 +820,66 @@ The buffer may be narrowed." (insert (rfc2047-parse-and-decode (prog1 (match-string 0) - (delete-region (match-beginning 0) (match-end 0))))) - ;; Remove newlines between decoded words. Though such things - ;; must not be essentially there. + (delete-region e (match-end 0))))) + (while (looking-at rfc2047-encoded-word-regexp) + (insert (rfc2047-parse-and-decode + (prog1 + (match-string 0) + (delete-region (point) (match-end 0)))))) (save-restriction (narrow-to-region e (point)) (goto-char e) + ;; Remove newlines between decoded words, though such + ;; things essentially must not be there. (while (re-search-forward "[\n\r]+" nil t) (replace-match " ")) + ;; Quote decoded words if there are special characters + ;; which might violate RFC2822. + (when (and rfc2047-quote-decoded-words-containing-tspecials + (let ((regexp (car (rassq + 'address-mime + rfc2047-header-encoding-alist)))) + (when regexp + (save-restriction + (widen) + (beginning-of-line) + (while (and (memq (char-after) '(? ?\t)) + (zerop (forward-line -1)))) + (looking-at regexp))))) + (let (quoted) + (goto-char e) + (skip-chars-forward " \t") + (setq start (point)) + (setq quoted (eq (char-after) ?\")) + (goto-char (point-max)) + (skip-chars-backward " \t") + (if (setq quoted (and quoted + (> (point) (1+ start)) + (eq (char-before) ?\"))) + (progn + (backward-char) + (setq start (1+ start) + end (point-marker))) + (setq end (point-marker))) + (goto-char start) + (while (search-forward "\"" end t) + (when (prog2 + (backward-char) + (zerop (% (skip-chars-backward "\\\\") 2)) + (goto-char (match-beginning 0))) + (insert "\\")) + (forward-char)) + (when (and (not quoted) + (progn + (goto-char start) + (re-search-forward + (concat "[" ietf-drums-tspecials "]") + end t))) + (goto-char start) + (insert "\"") + (goto-char end) + (insert "\"")) + (set-marker end nil))) (goto-char (point-max))) (when (and (mm-multibyte-p) mail-parse-charset @@ -539,59 +891,79 @@ The buffer may be narrowed." mail-parse-charset (not (eq mail-parse-charset 'us-ascii)) (not (eq mail-parse-charset 'gnus-decoded))) - (mm-decode-coding-region-safely b (point-max) mail-parse-charset)))))) + (mm-decode-coding-region b (point-max) mail-parse-charset)))))) (defun rfc2047-decode-string (string) "Decode the quoted-printable-encoded STRING and return the results." (let ((m (mm-multibyte-p))) (if (string-match "=\\?" string) (with-temp-buffer + ;; Fixme: This logic is wrong, but seems to be required by + ;; Gnus summary buffer generation. The value of `m' depends + ;; on the current buffer, not global multibyteness or that + ;; of the string. Also the string returned should always be + ;; multibyte in a multibyte session, i.e. the buffer should + ;; be multibyte before `buffer-string' is called. (when m (mm-enable-multibyte)) (insert string) (inline (rfc2047-decode-region (point-min) (point-max))) (buffer-string)) + ;; Fixme: As above, `m' here is inappropriate. (if (and m mail-parse-charset (not (eq mail-parse-charset 'us-ascii)) (not (eq mail-parse-charset 'gnus-decoded))) - (let* ((decoded (mm-decode-coding-string string mail-parse-charset)) - (charsets (find-charset-string decoded))) - (if (or (memq 'eight-bit-control charsets) - (memq 'eight-bit-graphic charsets)) - (mm-decode-coding-string string 'undecided) - decoded)) - string)))) + ;; `decode-coding-string' in Emacs offers a third optional + ;; arg NOCOPY to avoid consing a new string if the decoding + ;; is "trivial". Unfortunately it currently doesn't + ;; consider anything else than a `nil' coding system + ;; trivial. + ;; `rfc2047-decode-string' is called multiple times for each + ;; article during summary buffer generation, and we really + ;; want to avoid unnecessary consing. So we bypass + ;; `decode-coding-string' if the string is purely ASCII. + (if (and (fboundp 'detect-coding-string) + ;; string is purely ASCII + (eq (detect-coding-string string t) 'undecided)) + string + (mm-decode-coding-string string mail-parse-charset)) + (mm-string-as-multibyte string))))) (defun rfc2047-parse-and-decode (word) "Decode WORD and return it if it is an encoded word. -Return WORD if not." +Return WORD if it is not not an encoded word or if the charset isn't +decodable." (if (not (string-match rfc2047-encoded-word-regexp word)) word (or (condition-case nil (rfc2047-decode (match-string 1 word) - (upcase (match-string 2 word)) + (string-to-char (match-string 2 word)) (match-string 3 word)) (error word)) - word))) + word))) ; un-decodable (defun rfc2047-pad-base64 (string) "Pad STRING to quartets." ;; Be more liberal to accept buggy base64 strings. If ;; base64-decode-string accepts buggy strings, this function could ;; be aliased to identity. - (case (mod (length string) 4) - (0 string) - (1 string) ;; Error, don't pad it. - (2 (concat string "==")) - (3 (concat string "=")))) + (if (= 0 (mod (length string) 4)) + string + (when (string-match "=+$" string) + (setq string (substring string 0 (match-beginning 0)))) + (case (mod (length string) 4) + (0 string) + (1 string) ;; Error, don't pad it. + (2 (concat string "==")) + (3 (concat string "="))))) (defun rfc2047-decode (charset encoding string) "Decode STRING from the given MIME CHARSET in the given ENCODING. -Valid ENCODINGs are \"B\" and \"Q\". +Valid ENCODINGs are the characters \"B\" and \"Q\". If your Emacs implementation can't decode CHARSET, return nil." (if (stringp charset) (setq charset (intern (downcase charset)))) @@ -606,22 +978,20 @@ If your Emacs implementation can't decode CHARSET, return nil." (memq 'gnus-unknown mail-parse-ignored-charsets)) (setq cs (mm-charset-to-coding-system mail-parse-charset))) (when cs - (when (and (eq cs 'ascii) - mail-parse-charset) - (setq cs mail-parse-charset)) - (mm-with-unibyte-current-buffer-mule4 - ;; In Emacs Mule 4, decoding UTF-8 should be in unibyte mode. - (mm-decode-coding-string - (cond - ((equal "B" encoding) - (base64-decode-string - (rfc2047-pad-base64 string))) - ((equal "Q" encoding) - (quoted-printable-decode-string - (mm-replace-chars-in-string string ?_ ? ))) - (t (error "Invalid encoding: %s" encoding))) - cs))))) + (when (eq cs 'ascii) + (setq cs (or mail-parse-charset 'raw-text))) + (mm-decode-coding-string + (cond + ((char-equal ?B encoding) + (base64-decode-string + (rfc2047-pad-base64 string))) + ((char-equal ?Q encoding) + (quoted-printable-decode-string + (mm-subst-char-in-string ?_ ? string t))) + (t (error "Invalid encoding: %c" encoding))) + cs)))) (provide 'rfc2047) +;;; arch-tag: a07fe3d4-22b5-4c4a-bd89-b1f82d5d36f6 ;;; rfc2047.el ends here