X-Git-Url: http://cgit.sxemacs.org/?a=blobdiff_plain;f=lisp%2Frfc2047.el;h=ebd8a4a41bcb5a350be5f0480d3e8c04fb1123df;hb=0ed5b1813c561b0290b3ebbe2e0eb9bba6156076;hp=a282d6a3e489371019d620a3a086897b0814dae4;hpb=8804a180e8e3ba9e0444d3ef7eab9fc046a0e7df;p=gnus diff --git a/lisp/rfc2047.el b/lisp/rfc2047.el index a282d6a3e..ebd8a4a41 100644 --- a/lisp/rfc2047.el +++ b/lisp/rfc2047.el @@ -38,23 +38,12 @@ (require 'base64) (autoload 'mm-body-7-or-8 "mm-bodies") -;; Avoid gnus-util for mm- code. -(defalias 'rfc2047-point-at-bol - (if (fboundp 'point-at-bol) - 'point-at-bol - 'line-beginning-position)) - -(defalias 'rfc2047-point-at-eol - (if (fboundp 'point-at-eol) - 'point-at-eol - 'line-end-position)) - (defvar rfc2047-header-encoding-alist '(("Newsgroups" . nil) ("Followup-To" . nil) ("Message-ID" . nil) - ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\)" . - address-mime) + ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\ +\\|Mail-Followup-To\\|Mail-Copies-To\\|Approved\\)" . address-mime) (t . mime)) "*Header/encoding method alist. The list is traversed sequentially. The keys can either be @@ -92,7 +81,8 @@ The values can be: (cn-gb-2312 . B) (euc-kr . B) (iso-2022-jp-2 . B) - (iso-2022-int-1 . B)) + (iso-2022-int-1 . B) + (viscii . Q)) "Alist of MIME charsets to RFC2047 encodings. Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding, quoted-printable and base64 respectively.") @@ -103,19 +93,29 @@ quoted-printable and base64 respectively.") (nil . ignore)) "Alist of RFC2047 encodings to encoding functions.") -(defvar rfc2047-q-encoding-alist - '(("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|Reply-To\\|Sender\\):" - . "-A-Za-z0-9!*+/" ) - ;; = (\075), _ (\137), ? (\077) are used in the encoded word. - ;; Avoid using 8bit characters. - ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?" - ("." . "\010\012\014\040-\074\076\100-\136\140-\177")) - "Alist of header regexps and valid Q characters.") - ;;; ;;; Functions for encoding RFC2047 messages ;;; +(defun rfc2047-qp-or-base64 () + "Return the type with which to encode the buffer. +This is either `base64' or `quoted-printable'." + (save-excursion + (let ((limit (min (point-max) (+ 2000 (point-min)))) + (n8bit 0)) + (goto-char (point-min)) + (skip-chars-forward "\x20-\x7f\r\n\t" limit) + (while (< (point) limit) + (incf n8bit) + (forward-char 1) + (skip-chars-forward "\x20-\x7f\r\n\t" limit)) + (if (or (< (* 6 n8bit) (- limit (point-min))) + ;; Don't base64, say, a short line with a single + ;; non-ASCII char when splitting parts by charset. + (= n8bit 1)) + 'quoted-printable + 'base64)))) + (defun rfc2047-narrow-to-field () "Narrow the buffer to the header on the current line." (beginning-of-line) @@ -124,9 +124,7 @@ quoted-printable and base64 respectively.") (progn (forward-line 1) (if (re-search-forward "^[^ \n\t]" nil t) - (progn - (beginning-of-line) - (point)) + (point-at-bol) (point-max)))) (goto-char (point-min))) @@ -189,7 +187,7 @@ Should be called narrowed to the head of the message." ((eq method 'address-mime) (rfc2047-encode-region (point) (point-max))) ((eq method 'mime) - (let (rfc2047-encoding-type) + (let ((rfc2047-encoding-type 'mime)) (rfc2047-encode-region (point) (point-max)))) ((eq method 'default) (if (and (featurep 'mule) @@ -237,7 +235,8 @@ The buffer may be narrowed." (require 'message) ; for message-posting-charset (let ((charsets (mm-find-mime-charset-region (point-min) (point-max)))) - (and charsets (not (equal charsets (list message-posting-charset)))))) + (and charsets + (not (equal charsets (list (car message-posting-charset))))))) ;; Use this syntax table when parsing into regions that may need ;; encoding. Double quotes are string delimiters, backslash is @@ -246,11 +245,19 @@ The buffer may be narrowed." ;; skip to the end of regions appropriately. Nb. ietf-drums does ;; things differently. (defconst rfc2047-syntax-table - ;; This is what we should do, but XEmacs doesn't support the optional - ;; arg of `make-syntax-table': -;; (let ((table (make-char-table 'syntax-table '(2)))) - (let ((table (make-char-table 'syntax-table))) - (map-char-table (lambda (k v) (aset table k '(2))) table) + ;; (make-char-table 'syntax-table '(2)) only works in Emacs. + (let ((table (make-syntax-table))) + ;; The following is done to work for setting all elements of the table + ;; in Emacs 21 and 22 and XEmacs; it appears to be the cleanest way. + ;; Play safe and don't assume the form of the word syntax entry -- + ;; copy it from ?a. + (if (fboundp 'set-char-table-range) ; Emacs + (funcall (intern "set-char-table-range") + table t (aref (standard-syntax-table) ?a)) + (if (fboundp 'put-char-table) + (if (fboundp 'get-char-table) ; warning avoidance + (put-char-table t (get-char-table ?a (standard-syntax-table)) + table)))) (modify-syntax-entry ?\\ "\\" table) (modify-syntax-entry ?\" "\"" table) (modify-syntax-entry ?\( "." table) @@ -272,13 +279,23 @@ Dynamically bind `rfc2047-encoding-type' to change that." (save-restriction (narrow-to-region b e) (if (eq 'mime rfc2047-encoding-type) - ;; Simple case -- treat as single word. + ;; Simple case. Treat as single word after any initial ASCII + ;; part and before any tailing ASCII part. The leading ASCII + ;; is relevant for instance in Subject headers with `Re:' for + ;; interoperability with non-MIME clients, and we might as + ;; well avoid the tail too. (progn (goto-char (point-min)) ;; Does it need encoding? - (skip-chars-forward "\000-\177" e) + (skip-chars-forward "\000-\177") (unless (eobp) - (rfc2047-encode b e))) + (skip-chars-backward "^ \n") ; beginning of space-delimited word + (rfc2047-encode (point) (progn + (goto-char e) + (skip-chars-backward "\000-\177") + (skip-chars-forward "^ \n") + ;; end of space-delimited word + (point))))) ;; `address-mime' case -- take care of quoted words, comments. (with-syntax-table rfc2047-syntax-table (let ((start) ; start of current token @@ -287,14 +304,14 @@ Dynamically bind `rfc2047-encoding-type' to change that." ;; token, either immediately or separated by space. last-encoded) (goto-char (point-min)) - (condition-case nil ; in case of unbalanced quotes + (condition-case nil ; in case of unbalanced quotes ;; Look for rfc2822-style: sequences of atoms, quoted ;; strings, specials, whitespace. (Specials mustn't be ;; encoded.) (while (not (eobp)) (setq start (point)) ;; Skip whitespace. - (unless (= 0 (skip-chars-forward " \t")) + (unless (= 0 (skip-chars-forward " \t\n")) (setq start (point))) (cond ((not (char-after))) ; eob @@ -350,14 +367,15 @@ Dynamically bind `rfc2047-encoding-type' to change that." end (1+ end))) (rfc2047-encode start end) (setq last-encoded t))))) - (error (error "Invalid data for rfc2047 encoding: %s" - (buffer-substring b e))))))) + (error + (error "Invalid data for rfc2047 encoding: %s" + (buffer-substring b e))))))) (rfc2047-fold-region b (point)))) (defun rfc2047-encode-string (string) "Encode words in STRING. By default, the string is treated as containing addresses (see -`rfc2047-special-chars')." +`rfc2047-encoding-type')." (with-temp-buffer (insert string) (rfc2047-encode-region (point-min) (point-max)) @@ -366,7 +384,7 @@ By default, the string is treated as containing addresses (see (defun rfc2047-encode (b e) "Encode the word(s) in the region B to E. By default, the region is treated as containing addresses (see -`rfc2047-special-chars')." +`rfc2047-encoding-type')." (let* ((mime-charset (mm-find-mime-charset-region b e)) (cs (if (> (length mime-charset) 1) ;; Fixme: Instead of this, try to break region into @@ -377,14 +395,36 @@ By default, the region is treated as containing addresses (see (mm-charset-to-coding-system mime-charset))) ;; Fixme: Better, calculate the number of non-ASCII ;; characters, at least for 8-bit charsets. - (encoding (if (assq mime-charset - rfc2047-charset-encoding-alist) - (cdr (assq mime-charset + (encoding (or (cdr (assq mime-charset rfc2047-charset-encoding-alist)) - 'B)) + ;; For the charsets that don't have a preferred + ;; encoding, choose the one that's shorter. + (save-restriction + (narrow-to-region b e) + (if (eq (rfc2047-qp-or-base64) 'base64) + 'B + 'Q)))) (start (concat "=?" (downcase (symbol-name mime-charset)) "?" (downcase (symbol-name encoding)) "?")) + (factor (case mime-charset + ((iso-8859-5 iso-8859-7 iso-8859-8 koi8-r) 1) + ((big5 gb2312 euc-kr) 2) + (utf-8 4) + (t 8))) + (pre (- b (save-restriction + (widen) + (point-at-bol)))) + ;; encoded-words must not be longer than 75 characters, + ;; including charset, encoding etc. This leaves us with + ;; 75 - (length start) - 2 - 2 characters. The last 2 is for + ;; possible base64 padding. In the worst case (iso-2022-*) + ;; each character expands to 8 bytes which is expanded by a + ;; factor of 4/3 by base64 encoding. + (length (floor (- 75 (length start) 4) (* factor (/ 4.0 3.0)))) + ;; Limit line length to 76 characters. + (length1 (max 1 (floor (- 76 (length start) 4 pre) + (* factor (/ 4.0 3.0))))) (first t)) (if mime-charset (save-restriction @@ -393,9 +433,14 @@ By default, the region is treated as containing addresses (see ;; break into lines before encoding (goto-char (point-min)) (while (not (eobp)) - (goto-char (min (point-max) (+ 15 (point)))) + (if first + (progn + (goto-char (min (point-max) (+ length1 (point)))) + (setq first nil)) + (goto-char (min (point-max) (+ length (point))))) (unless (eobp) - (insert ?\n)))) + (insert ?\n))) + (setq first t)) (if (and (mm-multibyte-p) (mm-coding-system-p cs)) (mm-encode-coding-region (point-min) (point-max) cs)) @@ -428,7 +473,7 @@ By default, the region is treated as containing addresses (see (first t) (bol (save-restriction (widen) - (rfc2047-point-at-bol)))) + (point-at-bol)))) (while (not (eobp)) (when (and (or break qword-break) (> (- (point) bol) 76)) @@ -465,7 +510,9 @@ By default, the region is treated as containing addresses (see (if (eq (char-after) ?=) (forward-char 1) (skip-chars-forward "^ \t\n\r=")) - (setq qword-break (point)) + ;; Don't break at the start of the field. + (unless (= (point) b) + (setq qword-break (point))) (skip-chars-forward "^ \t\n\r"))) (t (skip-chars-forward "^ \t\n\r")))) @@ -497,19 +544,18 @@ By default, the region is treated as containing addresses (see (goto-char (point-min)) (let ((bol (save-restriction (widen) - (rfc2047-point-at-bol))) - (eol (rfc2047-point-at-eol)) - leading) + (point-at-bol))) + (eol (point-at-eol))) (forward-line 1) (while (not (eobp)) (if (and (looking-at "[ \t]") - (< (- (rfc2047-point-at-eol) bol) 76)) + (< (- (point-at-eol) bol) 76)) (delete-region eol (progn (goto-char eol) (skip-chars-forward "\r\n") (point))) - (setq bol (rfc2047-point-at-bol))) - (setq eol (rfc2047-point-at-eol)) + (setq bol (point-at-bol))) + (setq eol (point-at-eol)) (forward-line 1))))) (defun rfc2047-b-encode-region (b e) @@ -527,16 +573,21 @@ By default, the region is treated as containing addresses (see (save-excursion (save-restriction (narrow-to-region (goto-char b) e) - (let ((alist rfc2047-q-encoding-alist) - (bol (save-restriction + (let ((bol (save-restriction (widen) - (rfc2047-point-at-bol)))) - (while alist - (when (looking-at (caar alist)) - (quoted-printable-encode-region b e nil (cdar alist)) - (subst-char-in-region (point-min) (point-max) ? ?_) - (setq alist nil)) - (pop alist)) + (point-at-bol)))) + (quoted-printable-encode-region + b e nil + ;; = (\075), _ (\137), ? (\077) are used in the encoded word. + ;; Avoid using 8bit characters. + ;; This list excludes `especials' (see the RFC2047 syntax), + ;; meaning that some characters in non-structured fields will + ;; get encoded when they con't need to be. The following is + ;; what it used to be. +;;; ;; Equivalent to "^\000-\007\011\013\015-\037\200-\377=_?" +;;; "\010\012\014\040-\074\076\100-\136\140-\177") + "-\b\n\f !#-'*+0-9A-Z\\^`-~\d") + (subst-char-in-region (point-min) (point-max) ? ?_) ;; The size of QP encapsulation is about 20, so set limit to ;; 56=76-20. (unless (< (- (point-max) (point-min)) 56) @@ -556,13 +607,19 @@ By default, the region is treated as containing addresses (see (eval-and-compile (defconst rfc2047-encoded-word-regexp - "=\\?\\([^][\000-\040()<>@,\;:\\\"/?.=]+\\)\\?\\(B\\|Q\\)\ -\\?\\([!->@-~ +]*\\)\\?=")) + "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)(?:\\*[^?]+\\)?\ +\\?\\(B\\|Q\\)\\?\\([!->@-~ +]*\\)\\?=")) ;; Fixme: This should decode in place, not cons intermediate strings. ;; Also check whether it needs to worry about delimiting fields like ;; encoding. +;; In fact it's reported that (invalid) encoding of mailboxes in +;; addr-specs is in use, so delimiting fields might help. Probably +;; not decoding a word which isn't properly delimited is good enough +;; and worthwhile (is it more correct or not?), e.g. something like +;; `=?iso-8859-1?q?foo?=@'. + (defun rfc2047-decode-region (start end) "Decode MIME-encoded words in region between START and END." (interactive "r") @@ -613,18 +670,38 @@ By default, the region is treated as containing addresses (see (let ((m (mm-multibyte-p))) (if (string-match "=\\?" string) (with-temp-buffer + ;; Fixme: This logic is wrong, but seems to be required by + ;; Gnus summary buffer generation. The value of `m' depends + ;; on the current buffer, not global multibyteness or that + ;; of the string. Also the string returned should always be + ;; multibyte in a multibyte session, i.e. the buffer should + ;; be multibyte before `buffer-string' is called. (when m (mm-enable-multibyte)) (insert string) (inline (rfc2047-decode-region (point-min) (point-max))) - (mm-enable-multibyte) - (buffer-string))) - (if (and mail-parse-charset - (not (eq mail-parse-charset 'us-ascii)) - (not (eq mail-parse-charset 'gnus-decoded))) - (mm-decode-coding-string string mail-parse-charset) - (mm-string-as-multibyte string)))) + (buffer-string)) + ;; Fixme: As above, `m' here is inappropriate. + (if (and m + mail-parse-charset + (not (eq mail-parse-charset 'us-ascii)) + (not (eq mail-parse-charset 'gnus-decoded))) + ;; `decode-coding-string' in Emacs offers a third optional + ;; arg NOCOPY to avoid consing a new string if the decoding + ;; is "trivial". Unfortunately it currently doesn't + ;; consider anything else than a `nil' coding system + ;; trivial. + ;; `rfc2047-decode-string' is called multiple times for each + ;; article during summary buffer generation, and we really + ;; want to avoid unnecessary consing. So we bypass + ;; `decode-coding-string' if the string is purely ASCII. + (if (and (fboundp 'detect-coding-string) + ;; string is purely ASCII + (eq (detect-coding-string string t) 'undecided)) + string + (mm-decode-coding-string string mail-parse-charset)) + (mm-string-as-multibyte string))))) (defun rfc2047-parse-and-decode (word) "Decode WORD and return it if it is an encoded word. @@ -632,27 +709,33 @@ Return WORD if it is not not an encoded word or if the charset isn't decodable." (if (not (string-match rfc2047-encoded-word-regexp word)) word - (condition-case nil - (rfc2047-decode - (match-string 1 word) - (upcase (match-string 2 word)) - (match-string 3 word)) - (error word)))) + (or + (condition-case nil + (rfc2047-decode + (match-string 1 word) + (string-to-char (match-string 2 word)) + (match-string 3 word)) + (error word)) + word))) ; un-decodable (defun rfc2047-pad-base64 (string) "Pad STRING to quartets." ;; Be more liberal to accept buggy base64 strings. If ;; base64-decode-string accepts buggy strings, this function could ;; be aliased to identity. - (case (mod (length string) 4) - (0 string) - (1 string) ;; Error, don't pad it. - (2 (concat string "==")) - (3 (concat string "=")))) + (if (= 0 (mod (length string) 4)) + string + (when (string-match "=+$" string) + (setq string (substring string 0 (match-beginning 0)))) + (case (mod (length string) 4) + (0 string) + (1 string) ;; Error, don't pad it. + (2 (concat string "==")) + (3 (concat string "="))))) (defun rfc2047-decode (charset encoding string) "Decode STRING from the given MIME CHARSET in the given ENCODING. -Valid ENCODINGs are \"B\" and \"Q\". +Valid ENCODINGs are the characters \"B\" and \"Q\". If your Emacs implementation can't decode CHARSET, return nil." (if (stringp charset) (setq charset (intern (downcase charset)))) @@ -670,19 +753,16 @@ If your Emacs implementation can't decode CHARSET, return nil." (when (and (eq cs 'ascii) mail-parse-charset) (setq cs mail-parse-charset)) - ;; Fixme: What's this for? The following comment makes no sense. -- fx - (mm-with-unibyte-current-buffer - ;; In Emacs Mule 4, decoding UTF-8 should be in unibyte mode. - (mm-decode-coding-string - (cond - ((equal "B" encoding) - (base64-decode-string - (rfc2047-pad-base64 string))) - ((equal "Q" encoding) - (quoted-printable-decode-string - (mm-replace-chars-in-string string ?_ ? ))) - (t (error "Invalid encoding: %s" encoding))) - cs))))) + (mm-decode-coding-string + (cond + ((char-equal ?B encoding) + (base64-decode-string + (rfc2047-pad-base64 string))) + ((char-equal ?Q encoding) + (quoted-printable-decode-string + (mm-subst-char-in-string ?_ ? string t))) + (t (error "Invalid encoding: %c" encoding))) + cs)))) (provide 'rfc2047)