1 ;;; rfc2047.el --- functions for encoding and decoding rfc2047 messages
3 ;; Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 ;; 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
6 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
7 ;; MORIOKA Tomohiko <morioka@jaist.ac.jp>
8 ;; This file is part of GNU Emacs.
10 ;; GNU Emacs is free software: you can redistribute it and/or modify
11 ;; it under the terms of the GNU General Public License as published by
12 ;; the Free Software Foundation, either version 3 of the License, or
13 ;; (at your option) any later version.
15 ;; GNU Emacs is distributed in the hope that it will be useful,
16 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
17 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 ;; GNU General Public License for more details.
20 ;; You should have received a copy of the GNU General Public License
21 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
25 ;; RFC 2047 is "MIME (Multipurpose Internet Mail Extensions) Part
26 ;; Three: Message Header Extensions for Non-ASCII Text".
;; Value-less `defvar': declares the variable as special for this file
;; without defining or initialising it.  It is read by
;; `rfc2047-encode-message-header' and `rfc2047-encodable-p'; the
;; latter does (require 'message) before reading it.
32 (defvar message-posting-charset)
36 ;; Fixme: Avoid this (used for mail-parse-charset) mm dependence on gnus.
38 (require 'rfc2045) ;; rfc2045-encode-string
39 (autoload 'mm-body-7-or-8 "mm-bodies")
41 (defvar rfc2047-header-encoding-alist
42 '(("Newsgroups" . nil)
45 ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|\\(In-\\)?Reply-To\\|Sender\
46 \\|Mail-Followup-To\\|Mail-Copies-To\\|Approved\\)" . address-mime)
48 "*Header/encoding method alist.
49 The list is traversed sequentially. The keys can either be
54 1) nil, in which case no encoding is done;
55 2) `mime', in which case the header will be encoded according to RFC2047;
56 3) `address-mime', like `mime', but takes account of the rules for address
57 fields (where quoted strings and comments must be treated separately);
58 4) a charset, in which case it will be encoded as that charset;
59 5) `default', in which case the field will be encoded as the rest
62 (defvar rfc2047-charset-encoding-alist
88 "Alist of MIME charsets to RFC2047 encodings.
89 Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding,
90 quoted-printable and base64 respectively.")
92 (defvar rfc2047-encode-function-alist
93 '((Q . rfc2047-q-encode-string)
94 (B . rfc2047-b-encode-string)
96 "Alist of RFC2047 encodings to encoding functions.")
;; Read in two places in this file: `rfc2047-encodable-p' searches for
;; `rfc2047-encoded-word-regexp' only when this is non-nil, and
;; `rfc2047-encode-region' uses it to choose its encodable-text regexp
;; (the non-nil choice also matches a literal "=?").
98 (defvar rfc2047-encode-encoded-words t
99 "Whether encoded words should be encoded again.")
;; The leading `*' in the docstring is the old Emacs convention for
;; marking a variable as a user option.
;; NOTE(review): presumably this selects
;; `rfc2047-encoded-word-regexp-loose' over
;; `rfc2047-encoded-word-regexp' when decoding -- the decoding code is
;; not visible in this part of the file; confirm against it.
101 (defvar rfc2047-allow-irregular-q-encoded-words t
102 "*Whether to decode irregular Q-encoded words.")
104 (eval-and-compile ;; Necessary to hard code them in `rfc2047-decode-region'.
105 (defconst rfc2047-encoded-word-regexp
106 "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)\\(?:\\*[^?]+\\)?\\?\
107 \\(B\\?[+/0-9A-Za-z]*=*\
110 "Regexp that matches encoded word."
111 ;; The patterns for the B encoding and the Q encoding, i.e. the ones
112 ;; beginning with "B" and "Q" respectively, are restricted into only
113 ;; the characters that those encodings may generally use.
115 (defconst rfc2047-encoded-word-regexp-loose
116 "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)\\(?:\\*[^?]+\\)?\\?\
117 \\(B\\?[+/0-9A-Za-z]*=*\
118 \\|Q\\?\\(?:\\?+[ -<>@-~]\\)?\\(?:[ ->@-~]+\\?+[ -<>@-~]\\)*[ ->@-~]*\\?*\
120 "Regexp that matches encoded word allowing loose Q encoding."
121 ;; The pattern for the Q encoding, i.e. the one beginning with "Q",
123 ;; "Q\\?\\(\\?+[^\n=?]\\)?\\([^\n?]+\\?+[^\n=?]\\)*[^\n?]*\\?*"
124 ;; <--------1-------><----------2,3----------><--4--><-5->
126 ;; 1. After "Q?", allow "?"s that follow a character other than "=".
127 ;; 2. Allow "=" after "Q?"; it isn't regarded as the terminator.
128 ;; 3. In the middle of an encoded word, allow "?"s that follow a
129 ;; character other than "=".
130 ;; 4. Allow any characters other than "?" in the middle of an
132 ;; 5. At the end, allow "?"s.
136 ;;; Functions for encoding RFC2047 messages
139 (defun rfc2047-qp-or-base64 ()
140 "Return the type with which to encode the buffer.
141 This is either `base64' or `quoted-printable'."
143 (let ((limit (min (point-max) (+ 2000 (point-min))))
145 (goto-char (point-min))
146 (skip-chars-forward "\x20-\x7f\r\n\t" limit)
147 (while (< (point) limit)
150 (skip-chars-forward "\x20-\x7f\r\n\t" limit))
151 (if (or (< (* 6 n8bit) (- limit (point-min)))
152 ;; Don't base64, say, a short line with a single
153 ;; non-ASCII char when splitting parts by charset.
158 (defun rfc2047-narrow-to-field ()
159 "Narrow the buffer to the header on the current line."
165 (if (re-search-forward "^[^ \n\t]" nil t)
168 (goto-char (point-min)))
170 (defun rfc2047-field-value ()
171 "Return the value of the field at point."
174 (rfc2047-narrow-to-field)
175 (re-search-forward ":[ \t\n]*" nil t)
176 (buffer-substring-no-properties (point) (point-max)))))
178 (defun rfc2047-quote-special-characters-in-quoted-strings (&optional
180 "Quote special characters with `\\'s in quoted strings.
181 Quoting will not be done in a quoted string if it contains characters
182 matching ENCODABLE-REGEXP or it is within parentheses."
183 (goto-char (point-min))
184 (let ((tspecials (concat "[" ietf-drums-tspecials "]"))
187 (with-syntax-table (standard-syntax-table)
191 (eq (char-before) ?\)))
193 (goto-char (point-max)))
195 (narrow-to-region start (point))
197 (while (search-forward "\"" nil t)
198 (setq beg (match-beginning 0))
199 (unless (eq (char-before beg) ?\\)
205 (setq end (1- (point)))
207 (if (and encodable-regexp
208 (re-search-forward encodable-regexp end t))
211 (narrow-to-region beg end)
212 (while (re-search-forward tspecials nil 'move)
213 (if (eq (char-before) ?\\)
214 (if (looking-at tspecials) ;; Already quoted.
217 (goto-char (match-beginning 0))
223 (goto-char (point-max)))
225 (setq start (point))))))
;; `rfc2047-encode-region' tests this with (eq 'mime ...): `mime' takes
;; its simple unstructured-text path, any other value takes the
;; `address-mime' path that handles quoted words and comments.
;; Within this file it is let-bound to `mime' by
;; `rfc2047-encode-message-header' and by `rfc2047-encode-region'
;; itself when it recurses into parenthesised text.
227 (defvar rfc2047-encoding-type 'address-mime
228 "The type of encoding done by `rfc2047-encode-region'.
229 This should be dynamically bound around calls to
230 `rfc2047-encode-region' to either `mime' or `address-mime'. See
231 `rfc2047-header-encoding-alist', for definitions.")
233 (defun rfc2047-encode-message-header ()
234 "Encode the message header according to `rfc2047-header-encoding-alist'.
235 Should be called narrowed to the head of the message."
238 (goto-char (point-min))
239 (let (alist elem method)
242 (rfc2047-narrow-to-field)
244 alist rfc2047-header-encoding-alist)
245 (while (setq elem (pop alist))
246 (when (or (and (stringp (car elem))
247 (looking-at (car elem)))
251 (if (not (rfc2047-encodable-p))
253 (when (eq method 'address-mime)
254 (rfc2047-quote-special-characters-in-quoted-strings))
255 (if (and (eq (mm-body-7-or-8) '8bit)
258 (car message-posting-charset)))
259 ;; 8 bit must be decoded.
260 (mm-encode-coding-region
261 (point-min) (point-max)
262 (mm-charset-to-coding-system
263 (car message-posting-charset))))
264 ;; No encoding necessary, but folding is nice
268 (goto-char (point-min))
269 (skip-chars-forward "^:")
270 (when (looking-at ": ")
274 ;; We found something that may perhaps be encoded.
275 (re-search-forward "^[^:]+: *" nil t)
277 ((eq method 'address-mime)
278 (rfc2047-encode-region (point) (point-max)))
280 (let ((rfc2047-encoding-type 'mime))
281 (rfc2047-encode-region (point) (point-max))))
282 ((eq method 'default)
283 (if (and (featurep 'mule)
284 (if (boundp 'enable-multibyte-characters)
285 (default-value 'enable-multibyte-characters))
287 (mm-encode-coding-region (point) (point-max)
288 mail-parse-charset)))
289 ;; We get this when CC'ing messages to newsgroups with
290 ;; 8-bit names. The group name mail copy just got
291 ;; unconditionally encoded. Previously, it would ask
292 ;; whether to encode, which was quite confusing for the
293 ;; user. If the new behavior is wrong, tell me. I have
294 ;; left the old code commented out below.
295 ;; -- Per Abrahamsen <abraham@dina.kvl.dk> Date: 2001-10-07.
296 ;; Modified by Dave Love, with the commented-out code changed
297 ;; in accordance with changes elsewhere.
299 (rfc2047-encode-region (point) (point-max)))
301 ;;; (if (or (message-options-get
302 ;;; 'rfc2047-encode-message-header-encode-any)
303 ;;; (message-options-set
304 ;;; 'rfc2047-encode-message-header-encode-any
306 ;;; "Some texts are not encoded. Encode anyway?")))
307 ;;; (rfc2047-encode-region (point-min) (point-max))
308 ;;; (error "Cannot send unencoded text")))
309 ((mm-coding-system-p method)
310 (if (or (and (featurep 'mule)
311 (if (boundp 'enable-multibyte-characters)
312 (default-value 'enable-multibyte-characters)))
313 (featurep 'file-coding))
314 (mm-encode-coding-region (point) (point-max) method)))
317 (goto-char (point-max)))))))
319 ;; Fixme: This, and the require below may not be the Right Thing, but
320 ;; should be safe just before release. -- fx 2001-02-08
322 (defun rfc2047-encodable-p ()
323 "Return non-nil if any characters in current buffer need encoding in headers.
324 The buffer may be narrowed."
325 (require 'message) ; for message-posting-charset
327 (mm-find-mime-charset-region (point-min) (point-max))))
328 (goto-char (point-min))
329 (or (and rfc2047-encode-encoded-words
331 (re-search-forward rfc2047-encoded-word-regexp nil t)
332 (goto-char (point-min))))
334 (not (equal charsets (list (car message-posting-charset))))))))
336 ;; Use this syntax table when parsing into regions that may need
337 ;; encoding. Double quotes are string delimiters, backslash is
338 ;; character quoting, and all other RFC 2822 special characters are
339 ;; treated as punctuation so we can use forward-sexp/forward-word to
340 ;; skip to the end of regions appropriately. Nb. ietf-drums does
341 ;; things differently.
342 (defconst rfc2047-syntax-table
343 ;; (make-char-table 'syntax-table '(2)) only works in Emacs.
344 (let ((table (make-syntax-table)))
345 ;; The following is done to work for setting all elements of the table
346 ;; in Emacs 21-23 and XEmacs; it appears to be the cleanest way.
347 ;; Play safe and don't assume the form of the word syntax entry --
349 (if (fboundp 'set-char-table-range) ; Emacs
350 (funcall (intern "set-char-table-range")
351 table t (aref (standard-syntax-table) ?a))
352 (if (fboundp 'put-char-table)
353 (if (fboundp 'get-char-table) ; warning avoidance
354 (put-char-table t (get-char-table ?a (standard-syntax-table))
356 (modify-syntax-entry ?\\ "\\" table)
357 (modify-syntax-entry ?\" "\"" table)
358 (modify-syntax-entry ?\( "(" table)
359 (modify-syntax-entry ?\) ")" table)
360 (modify-syntax-entry ?\< "." table)
361 (modify-syntax-entry ?\> "." table)
362 (modify-syntax-entry ?\[ "." table)
363 (modify-syntax-entry ?\] "." table)
364 (modify-syntax-entry ?: "." table)
365 (modify-syntax-entry ?\; "." table)
366 (modify-syntax-entry ?, "." table)
367 (modify-syntax-entry ?@ "." table)
370 (defun rfc2047-encode-region (b e)
371 "Encode words in region B to E that need encoding.
372 By default, the region is treated as containing RFC2822 addresses.
373 Dynamically bind `rfc2047-encoding-type' to change that."
375 (narrow-to-region b e)
376 (let ((encodable-regexp (if rfc2047-encode-encoded-words
377 "[^\000-\177]+\\|=\\?"
379 start ; start of current token
381 ;; Whether there's an encoded word before the current token,
382 ;; either immediately or separated by space.
384 (orig-text (buffer-substring-no-properties b e)))
385 (if (eq 'mime rfc2047-encoding-type)
386 ;; Simple case. Consecutive words that each contain non-ASCII
387 ;; characters are encoded together as one unit. Encoding
388 ;; ASCII words, including `Re:' used in Subject headers, is
389 ;; avoided for interoperability with non-MIME clients and
390 ;; for making it easy to find keywords.
392 (goto-char (point-min))
393 (while (progn (skip-chars-forward " \t\n")
396 (while (and (looking-at "[ \t\n]*\\([^ \t\n]+\\)")
398 (setq end (match-end 0))
399 (re-search-forward encodable-regexp end t)))
401 (if (> (point) start)
402 (rfc2047-encode start (point))
404 ;; `address-mime' case -- take care of quoted words, comments.
405 (rfc2047-quote-special-characters-in-quoted-strings encodable-regexp)
406 (with-syntax-table rfc2047-syntax-table
407 (goto-char (point-min))
408 (condition-case err ; in case of unbalanced quotes
409 ;; Look for rfc2822-style: sequences of atoms, quoted
410 ;; strings, specials, whitespace. (Specials mustn't be
414 (skip-chars-forward " \t\n")
417 ((not (char-after))) ; eob
419 ((eq ?\" (setq csyntax (char-syntax (char-after))))
423 ;; Does it need encoding?
425 (if (re-search-forward encodable-regexp end 'move)
426 ;; It needs encoding. Strip the quotes first,
427 ;; since encoded words can't occur in quotes.
434 ;; There was a preceding quoted word. We need
435 ;; to include any separating whitespace in this
436 ;; word to avoid it getting lost.
437 (skip-chars-backward " \t")
438 ;; A space is needed between the encoded words.
442 ;; Adjust the end position for the deleted quotes.
443 (rfc2047-encode start (- end 2))
444 (setq last-encoded t)) ; record that it was encoded
445 (setq last-encoded nil)))
447 ;; Skip other delimiters, but record that they've
448 ;; potentially separated quoted words.
450 (setq last-encoded nil))
452 (error "Unbalanced parentheses"))
454 ;; Look for the end of parentheses.
456 ;; Encode text as an unstructured field.
457 (let ((rfc2047-encoding-type 'mime))
458 (rfc2047-encode-region (1+ start) (1- (point))))
459 (skip-chars-forward ")"))
460 (t ; normal token/whitespace sequence
462 ;; Skip one ASCII word, or encode consecutive words
463 ;; that each contain non-ASCII characters.
465 (while (not (or end (eobp)))
466 (when (looking-at "[\000-\177]+")
470 (while (and (or (re-search-forward
471 "[ \t\n]\\|\\Sw" end 'move)
473 (eq ?\\ (char-syntax (char-before))))
474 ;; Skip backslash-quoted characters.
477 (setq end (match-beginning 0))
478 (if rfc2047-encode-encoded-words
481 (when (search-forward "=?" end 'move)
482 (goto-char (match-beginning 0))
485 ;; Where the value nil of `end' means there may be
486 ;; text to have to be encoded following the point.
487 ;; Otherwise, the point reached to the end of ASCII
488 ;; words separated by whitespace or a special char.
490 (when (looking-at encodable-regexp)
491 (goto-char (setq begin (match-end 0)))
492 (while (and (looking-at "[ \t\n]+\\([^ \t\n]+\\)")
493 (setq end (match-end 0))
495 (while (re-search-forward
496 encodable-regexp end t))
499 (or (not (re-search-forward "\\Sw" end t))
501 (goto-char (match-beginning 0))
504 (when (looking-at "[^ \t\n]+")
505 (setq end (match-end 0))
506 (if (re-search-forward "\\Sw+" end t)
507 ;; There are special characters better
508 ;; to be encoded so that MTAs may parse
510 (cond ((= end (point)))
511 ((looking-at (concat "\\sw*\\("
516 (goto-char (1- (match-end 0)))
517 (unless (= (point) (match-beginning 0))
518 ;; Separate encodable text and
522 (skip-chars-forward " \t\n")
523 (if (and (looking-at "[^ \t\n]+")
524 (string-match encodable-regexp
527 (goto-char end)))))))
528 (skip-chars-backward " \t\n")
531 (if (re-search-forward encodable-regexp end 'move)
533 (unless (memq (char-before start) '(nil ?\t ? ))
536 (skip-chars-backward "^ \t\n")
537 (and (looking-at "\\Sw+")
538 (= (match-end 0) start)))
539 ;; Also encode bogus delimiters.
541 ;; Separate encodable text and delimiter.
544 (setq start (1+ start)
546 (rfc2047-encode start end)
547 (setq last-encoded t))
548 (setq last-encoded nil)))))
550 (if (or debug-on-quit debug-on-error)
551 (signal (car err) (cdr err))
552 (error "Invalid data for rfc2047 encoding: %s"
553 (mm-replace-in-string orig-text "[ \t\n]+" " "))))))))
554 (rfc2047-fold-region b (point))
555 (goto-char (point-max))))
557 (defun rfc2047-encode-string (string)
558 "Encode words in STRING.
559 By default, the string is treated as containing addresses (see
560 `rfc2047-encoding-type')."
561 (mm-with-multibyte-buffer
563 (rfc2047-encode-region (point-min) (point-max))
567 ;; 2. Syntax of encoded-words
569 ;; While there is no limit to the length of a multiple-line header
570 ;; field, each line of a header field that contains one or more
571 ;; 'encoded-word's is limited to 76 characters.
573 ;; In `rfc2047-encode-parameter' it is bound to nil, so don't defconst it.
;; Consulted by `rfc2047-encode-1': a nil value selects that function's
;; branch for unfolded output, otherwise the current column and the
;; length of each candidate encoded word are compared against it.
;; Note that `rfc2047-fold-region' compares line lengths against a
;; literal 76 rather than against this variable.
574 (defvar rfc2047-encode-max-chars 76
575 "Maximum characters of each header line that contain encoded-words.
576 According to RFC 2047, it is 76. If it is nil, encoded-words
577 will not be folded. Too small value may cause an error. You
578 should not change this value.")
580 (defun rfc2047-encode-1 (column string cs encoder start crest tail
582 "Subroutine used by `rfc2047-encode'."
583 (cond ((string-equal string "")
585 ((not rfc2047-encode-max-chars)
587 (funcall encoder (if cs
588 (mm-encode-coding-string string cs)
591 ((>= column rfc2047-encode-max-chars)
593 (cond ((string-match "\n[ \t]+\\'" eword)
594 ;; Remove a superfluous empty line.
595 (setq eword (substring eword 0 (match-beginning 0))))
596 ((string-match "(+\\'" eword)
597 ;; Break the line before the open parenthesis.
598 (setq crest (concat crest (match-string 0 eword))
599 eword (substring eword 0 (match-beginning 0))))))
600 (rfc2047-encode-1 (length crest) string cs encoder start " " tail
601 (concat eword "\n" crest)))
604 (limit (1- (length string)))
609 (setq next (concat start
612 (mm-encode-coding-string
613 (substring string 0 (1+ index))
615 (substring string 0 (1+ index))))
617 len (+ column (length next)))
618 (if (> len rfc2047-encode-max-chars)
621 (if (or (< index limit)
622 (<= (+ len (or (string-match "\n" tail)
624 rfc2047-encode-max-chars))
627 (if (string-match "\\`)+" tail)
628 ;; Break the line after the close parenthesis.
629 (setq tail (concat (substring tail 0 (match-end 0))
631 (substring tail (match-end 0)))
637 (concat eword next tail)
640 (string-match "(+\\'" eword))
641 (setq crest (concat crest (match-string 0 eword))
642 eword (substring eword 0 (match-beginning 0)))
643 (setq eword (concat eword next)))
645 eword (concat eword next)))
646 (when (string-match "\n[ \t]+\\'" eword)
647 ;; Remove a superfluous empty line.
648 (setq eword (substring eword 0 (match-beginning 0))))
649 (rfc2047-encode-1 (length crest) (substring string index)
650 cs encoder start " " tail
651 (concat eword "\n" crest)))))))
653 (defun rfc2047-encode (b e)
654 "Encode the word(s) in the region B to E.
655 Point moves to the end of the region."
656 (let ((mime-charset (or (mm-find-mime-charset-region b e) (list 'us-ascii)))
657 cs encoding tail crest eword)
658 ;; Use utf-8 as a last resort if determining charset of text fails.
659 (if (memq nil mime-charset)
660 (setq mime-charset (list 'utf-8)))
661 (cond ((> (length mime-charset) 1)
662 (error "Can't rfc2047-encode `%s'"
663 (buffer-substring-no-properties b e)))
664 ((= (length mime-charset) 1)
665 (setq mime-charset (car mime-charset)
666 cs (mm-charset-to-coding-system mime-charset))
667 (unless (and (mm-multibyte-p)
668 (mm-coding-system-p cs))
671 (narrow-to-region b e)
673 (or (cdr (assq mime-charset
674 rfc2047-charset-encoding-alist))
675 ;; For the charsets that don't have a preferred
676 ;; encoding, choose the one that's shorter.
677 (if (eq (rfc2047-qp-or-base64) 'base64)
682 (skip-chars-forward "^ \t\n")
683 ;; `tail' may contain a close parenthesis.
684 (setq tail (buffer-substring-no-properties e (point)))
686 (setq b (point-marker)
687 e (set-marker (make-marker) e))
688 (rfc2047-fold-region (point-at-bol) b)
690 (skip-chars-backward "^ \t\n")
691 (unless (= 0 (skip-chars-backward " \t"))
692 ;; `crest' may contain whitespace and an open parenthesis.
693 (setq crest (buffer-substring-no-properties (point) b)))
694 (setq eword (rfc2047-encode-1
696 (mm-replace-in-string
697 (buffer-substring-no-properties b e)
698 "\n\\([ \t]?\\)" "\\1")
700 (or (cdr (assq encoding
701 rfc2047-encode-function-alist))
703 (concat "=?" (downcase (symbol-name mime-charset))
704 "?" (upcase (symbol-name encoding)) "?")
707 (delete-region (if (eq (aref eword 0) ?\n)
709 ;; The line was folded before encoding.
714 ;; `eword' contains `crest' and `tail'.
718 (unless (or (/= 0 (length tail))
720 (looking-at "[ \t\n)]"))
725 (defun rfc2047-fold-field ()
726 "Fold the current header field."
729 (rfc2047-narrow-to-field)
730 (rfc2047-fold-region (point-min) (point-max)))))
732 (defun rfc2047-fold-region (b e)
733 "Fold long lines in region B to E."
735 (narrow-to-region b e)
736 (goto-char (point-min))
740 (bol (save-restriction
744 (when (and (or break qword-break)
745 (> (- (point) bol) 76))
746 (goto-char (or break qword-break))
749 (skip-chars-backward " \t")
750 (if (looking-at "[ \t]")
753 (setq bol (1- (point)))
754 ;; Don't break before the first non-LWSP characters.
755 (skip-chars-forward " \t")
759 ((eq (char-after) ?\n)
764 (skip-chars-forward " \t")
765 (unless (or (eobp) (eq (char-after) ?\n))
767 ((eq (char-after) ?\r)
769 ((memq (char-after) '(? ?\t))
770 (skip-chars-forward " \t")
771 (unless first ;; Don't break just after the header name.
772 (setq break (point))))
774 (if (not (looking-at "=\\?[^=]"))
775 (if (eq (char-after) ?=)
777 (skip-chars-forward "^ \t\n\r="))
778 ;; Don't break at the start of the field.
779 (unless (= (point) b)
780 (setq qword-break (point)))
781 (skip-chars-forward "^ \t\n\r")))
783 (skip-chars-forward "^ \t\n\r")))
785 (when (and (or break qword-break)
786 (> (- (point) bol) 76))
787 (goto-char (or break qword-break))