X-Git-Url: http://cgit.sxemacs.org/?a=blobdiff_plain;f=lisp%2Frfc2047.el;h=b0454953fa0dbe15c59947e702cb77f1175c3152;hb=45cfc67cc4116983fe3e486032d55973a1d532f0;hp=87de840e1af5a33f824595272d872712e7ff1d42;hpb=667db9eb4cff382d53625a5daabbba857cdb4616;p=gnus diff --git a/lisp/rfc2047.el b/lisp/rfc2047.el index 87de840e1..b0454953f 100644 --- a/lisp/rfc2047.el +++ b/lisp/rfc2047.el @@ -29,24 +29,7 @@ (eval-when-compile (require 'cl) - (defvar message-posting-charset) - (unless (fboundp 'with-syntax-table) ; not in Emacs 20 - (defmacro with-syntax-table (table &rest body) - "Evaluate BODY with syntax table of current buffer set to TABLE. -The syntax table of the current buffer is saved, BODY is evaluated, and the -saved table is restored, even in case of an abnormal exit. -Value is what BODY returns." - (let ((old-table (make-symbol "table")) - (old-buffer (make-symbol "buffer"))) - `(let ((,old-table (syntax-table)) - (,old-buffer (current-buffer))) - (unwind-protect - (progn - (set-syntax-table ,table) - ,@body) - (save-current-buffer - (set-buffer ,old-buffer) - (set-syntax-table ,old-table)))))))) + (defvar message-posting-charset)) (require 'qp) (require 'mm-util) @@ -55,24 +38,12 @@ Value is what BODY returns." (require 'base64) (autoload 'mm-body-7-or-8 "mm-bodies") -(eval-and-compile - ;; Avoid gnus-util for mm- code. - (defalias 'rfc2047-point-at-bol - (if (fboundp 'point-at-bol) - 'point-at-bol - 'line-beginning-position)) - - (defalias 'rfc2047-point-at-eol - (if (fboundp 'point-at-eol) - 'point-at-eol - 'line-end-position))) - (defvar rfc2047-header-encoding-alist '(("Newsgroups" . nil) ("Followup-To" . nil) ("Message-ID" . nil) - ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\)" . - address-mime) + ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\ +\\|Mail-Followup-To\\|Mail-Copies-To\\|Approved\\)" . address-mime) (t . mime)) "*Header/encoding method alist. The list is traversed sequentially. The keys can either be @@ -110,7 +81,8 @@ The values can be: (cn-gb-2312 . B) (euc-kr . B) (iso-2022-jp-2 . B) - (iso-2022-int-1 . B)) + (iso-2022-int-1 . B) + (viscii . Q)) "Alist of MIME charsets to RFC2047 encodings. Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding, quoted-printable and base64 respectively.") @@ -121,10 +93,32 @@ quoted-printable and base64 respectively.") (nil . ignore)) "Alist of RFC2047 encodings to encoding functions.") +(defvar rfc2047-encode-encoded-words t + "Whether encoded words should be encoded again.") + ;;; ;;; Functions for encoding RFC2047 messages ;;; +(defun rfc2047-qp-or-base64 () + "Return the type with which to encode the buffer. +This is either `base64' or `quoted-printable'." + (save-excursion + (let ((limit (min (point-max) (+ 2000 (point-min)))) + (n8bit 0)) + (goto-char (point-min)) + (skip-chars-forward "\x20-\x7f\r\n\t" limit) + (while (< (point) limit) + (incf n8bit) + (forward-char 1) + (skip-chars-forward "\x20-\x7f\r\n\t" limit)) + (if (or (< (* 6 n8bit) (- limit (point-min))) + ;; Don't base64, say, a short line with a single + ;; non-ASCII char when splitting parts by charset. + (= n8bit 1)) + 'quoted-printable + 'base64)))) + (defun rfc2047-narrow-to-field () "Narrow the buffer to the header on the current line." (beginning-of-line) @@ -133,9 +127,7 @@ quoted-printable and base64 respectively.") (progn (forward-line 1) (if (re-search-forward "^[^ \n\t]" nil t) - (progn - (beginning-of-line) - (point)) + (point-at-bol) (point-max)))) (goto-char (point-min))) @@ -145,7 +137,7 @@ quoted-printable and base64 respectively.") (save-restriction (rfc2047-narrow-to-field) (re-search-forward ":[ \t\n]*" nil t) - (buffer-substring (point) (point-max))))) + (buffer-substring-no-properties (point) (point-max))))) (defvar rfc2047-encoding-type 'address-mime "The type of encoding done by `rfc2047-encode-region'. @@ -246,8 +238,11 @@ The buffer may be narrowed." (require 'message) ; for message-posting-charset (let ((charsets (mm-find-mime-charset-region (point-min) (point-max)))) - (and charsets - (not (equal charsets (list (car message-posting-charset))))))) + (goto-char (point-min)) + (or (and rfc2047-encode-encoded-words + (search-forward "=?" nil t)) + (and charsets + (not (equal charsets (list (car message-posting-charset)))))))) ;; Use this syntax table when parsing into regions that may need ;; encoding. Double quotes are string delimiters, backslash is @@ -295,18 +290,23 @@ Dynamically bind `rfc2047-encoding-type' to change that." ;; is relevant for instance in Subject headers with `Re:' for ;; interoperability with non-MIME clients, and we might as ;; well avoid the tail too. - (progn + (let ((encodable-regexp + (if rfc2047-encode-encoded-words + "[^\000-\177]\\|=\\?" + "[^\000-\177]"))) (goto-char (point-min)) ;; Does it need encoding? - (skip-chars-forward "\000-\177") + (re-search-forward encodable-regexp (point-max) 'move) (unless (eobp) (skip-chars-backward "^ \n") ; beginning of space-delimited word - (rfc2047-encode (point) (progn - (goto-char e) - (skip-chars-backward "\000-\177") - (skip-chars-forward "^ \n") - ;; end of space-delimited word - (point))))) + (rfc2047-encode + (point) + (progn + (goto-char e) + (re-search-backward encodable-regexp (point-max) 'move) + (skip-chars-forward "^ \n") + ;; end of space-delimited word + (point))))) ;; `address-mime' case -- take care of quoted words, comments. (with-syntax-table rfc2047-syntax-table (let ((start) ; start of current token @@ -379,8 +379,8 @@ Dynamically bind `rfc2047-encoding-type' to change that." (rfc2047-encode start end) (setq last-encoded t))))) (error - (message "Invalid data for rfc2047 encoding: %s" - (buffer-substring b e))))))) + (error "Invalid data for rfc2047 encoding: %s" + (buffer-substring b e))))))) (rfc2047-fold-region b (point)))) (defun rfc2047-encode-string (string) @@ -396,7 +396,7 @@ By default, the string is treated as containing addresses (see "Encode the word(s) in the region B to E. By default, the region is treated as containing addresses (see `rfc2047-encoding-type')." - (let* ((mime-charset (mm-find-mime-charset-region b e)) + (let* ((mime-charset (or (mm-find-mime-charset-region b e) (list 'us-ascii))) (cs (if (> (length mime-charset) 1) ;; Fixme: Instead of this, try to break region into ;; parts that can be encoded separately. @@ -406,14 +406,36 @@ By default, the region is treated as containing addresses (see (mm-charset-to-coding-system mime-charset))) ;; Fixme: Better, calculate the number of non-ASCII ;; characters, at least for 8-bit charsets. - (encoding (if (assq mime-charset - rfc2047-charset-encoding-alist) - (cdr (assq mime-charset + (encoding (or (cdr (assq mime-charset rfc2047-charset-encoding-alist)) - 'B)) + ;; For the charsets that don't have a preferred + ;; encoding, choose the one that's shorter. + (save-restriction + (narrow-to-region b e) + (if (eq (rfc2047-qp-or-base64) 'base64) + 'B + 'Q)))) (start (concat "=?" (downcase (symbol-name mime-charset)) "?" - (downcase (symbol-name encoding)) "?")) + (upcase (symbol-name encoding)) "?")) + (factor (case mime-charset + ((iso-8859-5 iso-8859-7 iso-8859-8 koi8-r) 1) + ((big5 gb2312 euc-kr) 2) + (utf-8 4) + (t 8))) + (pre (- b (save-restriction + (widen) + (point-at-bol)))) + ;; encoded-words must not be longer than 75 characters, + ;; including charset, encoding etc. This leaves us with + ;; 75 - (length start) - 2 - 2 characters. The last 2 is for + ;; possible base64 padding. In the worst case (iso-2022-*) + ;; each character expands to 8 bytes which is expanded by a + ;; factor of 4/3 by base64 encoding. + (length (floor (- 75 (length start) 4) (* factor (/ 4.0 3.0)))) + ;; Limit line length to 76 characters. + (length1 (max 1 (floor (- 76 (length start) 4 pre) + (* factor (/ 4.0 3.0))))) (first t)) (if mime-charset (save-restriction @@ -422,9 +444,14 @@ By default, the region is treated as containing addresses (see ;; break into lines before encoding (goto-char (point-min)) (while (not (eobp)) - (goto-char (min (point-max) (+ 15 (point)))) + (if first + (progn + (goto-char (min (point-max) (+ length1 (point)))) + (setq first nil)) + (goto-char (min (point-max) (+ length (point))))) (unless (eobp) - (insert ?\n)))) + (insert ?\n))) + (setq first t)) (if (and (mm-multibyte-p) (mm-coding-system-p cs)) (mm-encode-coding-region (point-min) (point-max) cs)) @@ -457,7 +484,7 @@ By default, the region is treated as containing addresses (see (first t) (bol (save-restriction (widen) - (rfc2047-point-at-bol)))) + (point-at-bol)))) (while (not (eobp)) (when (and (or break qword-break) (> (- (point) bol) 76)) @@ -494,7 +521,9 @@ By default, the region is treated as containing addresses (see (if (eq (char-after) ?=) (forward-char 1) (skip-chars-forward "^ \t\n\r=")) - (setq qword-break (point)) + ;; Don't break at the start of the field. + (unless (= (point) b) + (setq qword-break (point))) (skip-chars-forward "^ \t\n\r"))) (t (skip-chars-forward "^ \t\n\r")))) @@ -526,18 +555,18 @@ By default, the region is treated as containing addresses (see (goto-char (point-min)) (let ((bol (save-restriction (widen) - (rfc2047-point-at-bol))) - (eol (rfc2047-point-at-eol))) + (point-at-bol))) + (eol (point-at-eol))) (forward-line 1) (while (not (eobp)) (if (and (looking-at "[ \t]") - (< (- (rfc2047-point-at-eol) bol) 76)) + (< (- (point-at-eol) bol) 76)) (delete-region eol (progn (goto-char eol) (skip-chars-forward "\r\n") (point))) - (setq bol (rfc2047-point-at-bol))) - (setq eol (rfc2047-point-at-eol)) + (setq bol (point-at-bol))) + (setq eol (point-at-eol)) (forward-line 1))))) (defun rfc2047-b-encode-region (b e) @@ -557,13 +586,18 @@ By default, the region is treated as containing addresses (see (narrow-to-region (goto-char b) e) (let ((bol (save-restriction (widen) - (rfc2047-point-at-bol)))) + (point-at-bol)))) (quoted-printable-encode-region b e nil ;; = (\075), _ (\137), ? (\077) are used in the encoded word. ;; Avoid using 8bit characters. - ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?" - "\010\012\014\040-\074\076\100-\136\140-\177") + ;; This list excludes `especials' (see the RFC2047 syntax), + ;; meaning that some characters in non-structured fields will + ;; get encoded when they con't need to be. The following is + ;; what it used to be. +;;; ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?" +;;; "\010\012\014\040-\074\076\100-\136\140-\177") + "-\b\n\f !#-'*+0-9A-Z\\^`-~\d") (subst-char-in-region (point-min) (point-max) ? ?_) ;; The size of QP encapsulation is about 20, so set limit to ;; 56=76-20. @@ -584,8 +618,8 @@ By default, the region is treated as containing addresses (see (eval-and-compile (defconst rfc2047-encoded-word-regexp - "=\\?\\([^][\000-\040()<>@,\;:\\\"/?.=]+\\)\\?\\(B\\|Q\\)\ -\\?\\([!->@-~ +]*\\)\\?=")) + "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)\\(?:\\*[^?]+\\)?\ +\\?\\(B\\|Q\\)\\?\\([!->@-~ ]*\\)\\?=")) ;; Fixme: This should decode in place, not cons intermediate strings. ;; Also check whether it needs to worry about delimiting fields like @@ -664,7 +698,20 @@ By default, the region is treated as containing addresses (see mail-parse-charset (not (eq mail-parse-charset 'us-ascii)) (not (eq mail-parse-charset 'gnus-decoded))) - (mm-decode-coding-string string mail-parse-charset) + ;; `decode-coding-string' in Emacs offers a third optional + ;; arg NOCOPY to avoid consing a new string if the decoding + ;; is "trivial". Unfortunately it currently doesn't + ;; consider anything else than a `nil' coding system + ;; trivial. + ;; `rfc2047-decode-string' is called multiple times for each + ;; article during summary buffer generation, and we really + ;; want to avoid unnecessary consing. So we bypass + ;; `decode-coding-string' if the string is purely ASCII. + (if (and (fboundp 'detect-coding-string) + ;; string is purely ASCII + (eq (detect-coding-string string t) 'undecided)) + string + (mm-decode-coding-string string mail-parse-charset)) (mm-string-as-multibyte string))))) (defun rfc2047-parse-and-decode (word) @@ -677,7 +724,7 @@ decodable." (condition-case nil (rfc2047-decode (match-string 1 word) - (upcase (match-string 2 word)) + (string-to-char (match-string 2 word)) (match-string 3 word)) (error word)) word))) ; un-decodable @@ -687,15 +734,19 @@ decodable." ;; Be more liberal to accept buggy base64 strings. If ;; base64-decode-string accepts buggy strings, this function could ;; be aliased to identity. - (case (mod (length string) 4) - (0 string) - (1 string) ;; Error, don't pad it. - (2 (concat string "==")) - (3 (concat string "=")))) + (if (= 0 (mod (length string) 4)) + string + (when (string-match "=+$" string) + (setq string (substring string 0 (match-beginning 0)))) + (case (mod (length string) 4) + (0 string) + (1 string) ;; Error, don't pad it. + (2 (concat string "==")) + (3 (concat string "="))))) (defun rfc2047-decode (charset encoding string) "Decode STRING from the given MIME CHARSET in the given ENCODING. -Valid ENCODINGs are \"B\" and \"Q\". +Valid ENCODINGs are the characters \"B\" and \"Q\". If your Emacs implementation can't decode CHARSET, return nil." (if (stringp charset) (setq charset (intern (downcase charset)))) @@ -715,13 +766,13 @@ If your Emacs implementation can't decode CHARSET, return nil." (setq cs mail-parse-charset)) (mm-decode-coding-string (cond - ((equal "B" encoding) + ((char-equal ?B encoding) (base64-decode-string (rfc2047-pad-base64 string))) - ((equal "Q" encoding) + ((char-equal ?Q encoding) (quoted-printable-decode-string - (mm-replace-chars-in-string string ?_ ? ))) - (t (error "Invalid encoding: %s" encoding))) + (mm-subst-char-in-string ?_ ? string t))) + (t (error "Invalid encoding: %c" encoding))) cs)))) (provide 'rfc2047)