1 ;;; rfc2047.el --- functions for encoding and decoding rfc2047 messages
3 ;; Copyright (C) 1998-2011 Free Software Foundation, Inc.
5 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
6 ;; MORIOKA Tomohiko <morioka@jaist.ac.jp>
7 ;; This file is part of GNU Emacs.
9 ;; GNU Emacs is free software: you can redistribute it and/or modify
10 ;; it under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation, either version 3 of the License, or
12 ;; (at your option) any later version.
14 ;; GNU Emacs is distributed in the hope that it will be useful,
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details.
19 ;; You should have received a copy of the GNU General Public License
20 ;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
24 ;; RFC 2047 is "MIME (Multipurpose Internet Mail Extensions) Part
25 ;; Three: Message Header Extensions for Non-ASCII Text".
31 (defvar message-posting-charset)
35 ;; Fixme: Avoid this (used for mail-parse-charset) mm dependence on gnus.
37 (require 'rfc2045) ;; rfc2045-encode-string
38 (autoload 'mm-body-7-or-8 "mm-bodies")
;; Each entry is (KEY . METHOD).  `rfc2047-encode-message-header' tests
;; string keys with `looking-at', so they are regexps matched at the start
;; of the header line.  Of the entries below, "Newsgroups" is left unencoded
;; (nil) and the address-style headers use `address-mime'.
;; NOTE(review): presumably the first matching entry wins -- confirm.
40 (defvar rfc2047-header-encoding-alist
41 '(("Newsgroups" . nil)
44 ("\\(Resent-\\)?\\(From\\|Cc\\|To\\|Bcc\\|\\(In-\\)?Reply-To\\|Sender\
45 \\|Mail-Followup-To\\|Mail-Copies-To\\|Approved\\)" . address-mime)
47 "*Header/encoding method alist.
48 The list is traversed sequentially. The keys can either be
53 1) nil, in which case no encoding is done;
54 2) `mime', in which case the header will be encoded according to RFC2047;
55 3) `address-mime', like `mime', but takes account of the rules for address
56 fields (where quoted strings and comments must be treated separately);
57 4) a charset, in which case it will be encoded as that charset;
58 5) `default', in which case the field will be encoded as the rest
;; Per the docstring, each entry pairs a MIME charset with the RFC 2047
;; encoding to use for it: `Q' (quoted-printable), `B' (base64) or nil
;; (no encoding).
61 (defvar rfc2047-charset-encoding-alist
87 "Alist of MIME charsets to RFC2047 encodings.
88 Valid encodings are nil, `Q' and `B'. These indicate binary (no) encoding,
89 quoted-printable and base64 respectively.")
;; Dispatch table: the car is an encoding symbol of the kind stored in
;; `rfc2047-charset-encoding-alist', the cdr is the function that performs
;; that encoding.
91 (defvar rfc2047-encode-function-alist
92 '((Q . rfc2047-q-encode-string)
93 (B . rfc2047-b-encode-string)
95 "Alist of RFC2047 encodings to encoding functions.")
;; When non-nil, text that already looks like an encoded word (`=?...')
;; counts as needing encoding: `rfc2047-encodable-p' searches for
;; `rfc2047-encoded-word-regexp', and `rfc2047-encode-region' adds `=?'
;; to its encodable regexp.
97 (defvar rfc2047-encode-encoded-words t
98 "Whether encoded words should be encoded again.")
;; The leading `*' in the docstring is the older convention for marking a
;; variable as a user option.
;; NOTE(review): presumably this selects `rfc2047-encoded-word-regexp-loose'
;; over the strict regexp when decoding -- confirm against the decoder.
100 (defvar rfc2047-allow-irregular-q-encoded-words t
101 "*Whether to decode irregular Q-encoded words.")
;; Both regexps below open the same way: a literal `=?', the charset name
;; captured as group 1, an optional uncaptured `*...' suffix (presumably a
;; language tag -- confirm), a literal `?', and then group 2, which begins
;; with the encoding letter followed by `?'.  They differ only in how
;; strictly the Q-encoded text is matched.
103 (eval-and-compile ;; Necessary to hard code them in `rfc2047-decode-region'.
104 (defconst rfc2047-encoded-word-regexp
105 "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)\\(?:\\*[^?]+\\)?\\?\
106 \\(B\\?[+/0-9A-Za-z]*=*\
109 "Regexp that matches encoded word."
110 ;; The patterns for the B encoding and the Q encoding, i.e. the ones
111 ;; beginning with "B" and "Q" respectively, are restricted into only
112 ;; the characters that those encodings may generally use.
114 (defconst rfc2047-encoded-word-regexp-loose
115 "=\\?\\([^][\000-\040()<>@,\;:*\\\"/?.=]+\\)\\(?:\\*[^?]+\\)?\\?\
116 \\(B\\?[+/0-9A-Za-z]*=*\
117 \\|Q\\?\\(?:\\?+[ -<>@-~]\\)?\\(?:[ ->@-~]+\\?+[ -<>@-~]\\)*[ ->@-~]*\\?*\
119 "Regexp that matches encoded word allowing loose Q encoding."
120 ;; The pattern for the Q encoding, i.e. the one beginning with "Q",
122 ;; "Q\\?\\(\\?+[^\n=?]\\)?\\([^\n?]+\\?+[^\n=?]\\)*[^\n?]*\\?*"
123 ;; <--------1-------><----------2,3----------><--4--><-5->
125 ;; 1. After "Q?", allow "?"s that follow a character other than "=".
126 ;; 2. Allow "=" after "Q?"; it isn't regarded as the terminator.
127 ;; 3. In the middle of an encoded word, allow "?"s that follow a
128 ;; character other than "=".
129 ;; 4. Allow any characters other than "?" in the middle of an
131 ;; 5. At the end, allow "?"s.
135 ;;; Functions for encoding RFC2047 messages
;; Chooses between the two encodings by sampling the start of the buffer
;; and comparing a count (`n8bit') with the length of the sample.
138 (defun rfc2047-qp-or-base64 ()
139 "Return the type with which to encode the buffer.
140 This is either `base64' or `quoted-printable'."
;; At most the first 2000 characters of the accessible region are examined.
142 (let ((limit (min (point-max) (+ 2000 (point-min))))
144 (goto-char (point-min))
;; Skip the leading run of characters in \x20-\x7f plus CR, LF and TAB.
145 (skip-chars-forward "\x20-\x7f\r\n\t" limit)
146 (while (< (point) limit)
149 (skip-chars-forward "\x20-\x7f\r\n\t" limit))
;; First test: `n8bit' is below one sixth of the sampled length.
;; NOTE(review): presumably `n8bit' counts the characters outside the
;; skipped set and a true result selects quoted-printable -- confirm.
150 (if (or (< (* 6 n8bit) (- limit (point-min)))
151 ;; Don't base64, say, a short line with a single
152 ;; non-ASCII char when splitting parts by charset.
;; The end of the field is located by searching for the next line whose
;; first character is not a space, tab or newline; lines that start with
;; whitespace are thus treated as belonging to the current header.
;; Point is left at `point-min'.
157 (defun rfc2047-narrow-to-field ()
158 "Narrow the buffer to the header on the current line."
164 (if (re-search-forward "^[^ \n\t]" nil t)
167 (goto-char (point-min)))
;; Returns the text after the first colon of the current field (and any
;; whitespace following it) as a string without text properties.
169 (defun rfc2047-field-value ()
170 "Return the value of the field at point."
173 (rfc2047-narrow-to-field)
;; NOERROR is t, so if no colon is found point does not move and the
;; substring below starts wherever point already was.
174 (re-search-forward ":[ \t\n]*" nil t)
175 (buffer-substring-no-properties (point) (point-max)))))
;; Works on the accessible portion of the buffer, starting from
;; `point-min'.  ENCODABLE-REGEXP, when given, exempts quoted strings that
;; contain a match for it (see the docstring).
177 (defun rfc2047-quote-special-characters-in-quoted-strings (&optional
179 "Quote special characters with `\\'s in quoted strings.
180 Quoting will not be done in a quoted string if it contains characters
181 matching ENCODABLE-REGEXP or it is within parentheses."
182 (goto-char (point-min))
;; Character class matching any single character of `ietf-drums-tspecials'.
183 (let ((tspecials (concat "[" ietf-drums-tspecials "]"))
186 (with-syntax-table (standard-syntax-table)
190 (eq (char-before) ?\)))
192 (goto-char (point-max)))
194 (narrow-to-region start (point))
;; Visit each double-quote character in the narrowed region.
196 (while (search-forward "\"" nil t)
197 (setq beg (match-beginning 0))
;; A quote preceded by a backslash is escaped and does not open a string.
198 (unless (eq (char-before beg) ?\\)
204 (setq end (1- (point)))
;; Check whether the quoted string contains encodable text.
206 (if (and encodable-regexp
207 (re-search-forward encodable-regexp end t))
210 (narrow-to-region beg end)
;; Walk over every tspecial inside the quoted string.
211 (while (re-search-forward tspecials nil 'move)
212 (if (eq (char-before) ?\\)
213 (if (looking-at tspecials) ;; Already quoted.
216 (goto-char (match-beginning 0))
222 (goto-char (point-max)))
224 (setq start (point))))))
;; `rfc2047-encode-region' tests this with (eq 'mime ...): `mime' selects
;; its simple unstructured-text path, any other value the address-aware
;; path.  Callers in this file rebind it to `mime' with `let' where needed.
226 (defvar rfc2047-encoding-type 'address-mime
227 "The type of encoding done by `rfc2047-encode-region'.
228 This should be dynamically bound around calls to
229 `rfc2047-encode-region' to either `mime' or `address-mime'. See
230 `rfc2047-header-encoding-alist', for definitions.")
;; For each header field: narrow to it, pick a method by matching the
;; field against `rfc2047-header-encoding-alist', then either leave the
;; field alone (nothing encodable) or encode the text after "Name: " in the
;; way the method prescribes.
232 (defun rfc2047-encode-message-header ()
233 "Encode the message header according to `rfc2047-header-encoding-alist'.
234 Should be called narrowed to the head of the message."
237 (goto-char (point-min))
238 (let (alist elem method)
241 (rfc2047-narrow-to-field)
243 alist rfc2047-header-encoding-alist)
;; Look for an alist entry whose regexp key matches at point.
244 (while (setq elem (pop alist))
245 (when (or (and (stringp (car elem))
246 (looking-at (car elem)))
;; Branch taken when the field contains nothing that needs encoding.
250 (if (not (rfc2047-encodable-p))
252 (when (eq method 'address-mime)
253 (rfc2047-quote-special-characters-in-quoted-strings))
254 (if (and (eq (mm-body-7-or-8) '8bit)
257 (car message-posting-charset)))
258 ;; 8 bit must be decoded.
259 (mm-encode-coding-region
260 (point-min) (point-max)
261 (mm-charset-to-coding-system
262 (car message-posting-charset))))
263 ;; No encoding necessary, but folding is nice
267 (goto-char (point-min))
268 (skip-chars-forward "^:")
269 (when (looking-at ": ")
273 ;; We found something that may perhaps be encoded.
;; Move point past the "Name: " prefix so only the value is encoded.
274 (re-search-forward "^[^:]+: *" nil t)
;; `address-mime': encode with whatever `rfc2047-encoding-type' is
;; currently bound to (its default is `address-mime').
276 ((eq method 'address-mime)
277 (rfc2047-encode-region (point) (point-max)))
;; Rebind the encoding type so the value is encoded as unstructured text.
279 (let ((rfc2047-encoding-type 'mime))
280 (rfc2047-encode-region (point) (point-max))))
281 ((eq method 'default)
282 (if (and (featurep 'mule)
283 (if (boundp 'enable-multibyte-characters)
284 (default-value 'enable-multibyte-characters))
286 (mm-encode-coding-region (point) (point-max)
287 mail-parse-charset)))
288 ;; We get this when CC'ing messages to newsgroups with
289 ;; 8-bit names. The group name mail copy just got
290 ;; unconditionally encoded. Previously, it would ask
291 ;; whether to encode, which was quite confusing for the
292 ;; user. If the new behavior is wrong, tell me. I have
293 ;; left the old code commented out below.
294 ;; -- Per Abrahamsen <abraham@dina.kvl.dk> Date: 2001-10-07.
295 ;; Modified by Dave Love, with the commented-out code changed
296 ;; in accordance with changes elsewhere.
298 (rfc2047-encode-region (point) (point-max)))
300 ;;; (if (or (message-options-get
301 ;;; 'rfc2047-encode-message-header-encode-any)
302 ;;; (message-options-set
303 ;;; 'rfc2047-encode-message-header-encode-any
305 ;;; "Some texts are not encoded. Encode anyway?")))
306 ;;; (rfc2047-encode-region (point-min) (point-max))
307 ;;; (error "Cannot send unencoded text")))
;; METHOD names a coding system: encode the value directly with it.
308 ((mm-coding-system-p method)
309 (if (or (and (featurep 'mule)
310 (if (boundp 'enable-multibyte-characters)
311 (default-value 'enable-multibyte-characters)))
312 (featurep 'file-coding))
313 (mm-encode-coding-region (point) (point-max) method)))
;; Leave point at the end of the narrowed field.
316 (goto-char (point-max)))))))
318 ;; Fixme: This, and the require below may not be the Right Thing, but
319 ;; should be safe just before release. -- fx 2001-02-08
;; Predicate over the accessible region; moves point to `point-min'.
321 (defun rfc2047-encodable-p ()
322 "Return non-nil if any characters in current buffer need encoding in headers.
323 The buffer may be narrowed."
324 (require 'message) ; for message-posting-charset
;; NOTE(review): `charsets', tested below, is presumably bound to the
;; result of this call -- confirm.
326 (mm-find-mime-charset-region (point-min) (point-max))))
327 (goto-char (point-min))
;; First alternative: with `rfc2047-encode-encoded-words' set, an existing
;; encoded word makes the text encodable; point is then reset to the start.
328 (or (and rfc2047-encode-encoded-words
330 (re-search-forward rfc2047-encoded-word-regexp nil t)
331 (goto-char (point-min))))
;; Otherwise: encodable unless the charsets found are exactly the single
;; charset in the car of `message-posting-charset'.
333 (not (equal charsets (list (car message-posting-charset))))))))
335 ;; Use this syntax table when parsing into regions that may need
336 ;; encoding. Double quotes are string delimiters, backslash is
337 ;; character quoting, and all other RFC 2822 special characters are
338 ;; treated as punctuation so we can use forward-sexp/forward-word to
339 ;; skip to the end of regions appropriately. Nb. ietf-drums does
340 ;; things differently.
341 (defconst rfc2047-syntax-table
342 ;; (make-char-table 'syntax-table '(2)) only works in Emacs.
343 (let ((table (make-syntax-table)))
344 ;; The following is done to work for setting all elements of the table;
345 ;; it appears to be the cleanest way.
346 ;; Play safe and don't assume the form of the word syntax entry --
;; Give every character the syntax entry that `a' has in the standard
;; syntax table, using the XEmacs or the Emacs char-table API as available.
348 (if (featurep 'xemacs)
349 (put-char-table t (get-char-table ?a (standard-syntax-table)) table)
350 (set-char-table-range table t (aref (standard-syntax-table) ?a)))
;; Overrides: backslash is an escape character, the double quote is a
;; string delimiter, parentheses are open/close delimiters, and the
;; remaining specials below get punctuation syntax (".").
351 (modify-syntax-entry ?\\ "\\" table)
352 (modify-syntax-entry ?\" "\"" table)
353 (modify-syntax-entry ?\( "(" table)
354 (modify-syntax-entry ?\) ")" table)
355 (modify-syntax-entry ?\< "." table)
356 (modify-syntax-entry ?\> "." table)
357 (modify-syntax-entry ?\[ "." table)
358 (modify-syntax-entry ?\] "." table)
359 (modify-syntax-entry ?: "." table)
360 (modify-syntax-entry ?\; "." table)
361 (modify-syntax-entry ?, "." table)
362 (modify-syntax-entry ?@ "." table)
365 (defun rfc2047-encode-region (b e)
366 "Encode words in region B to E that need encoding.
367 By default, the region is treated as containing RFC2822 addresses.
368 Dynamically bind `rfc2047-encoding-type' to change that."
370 (narrow-to-region b e)
371 (let ((encodable-regexp (if rfc2047-encode-encoded-words
372 "[^\000-\177]+\\|=\\?"
374 start ; start of current token
376 ;; Whether there's an encoded word before the current token,
377 ;; either immediately or separated by space.
379 (orig-text (buffer-substring-no-properties b e)))
380 (if (eq 'mime rfc2047-encoding-type)
381 ;; Simple case. Continuous words in which all those contain
382 ;; non-ASCII characters are encoded collectively. Encoding
383 ;; ASCII words, including `Re:' used in Subject headers, is
384 ;; avoided for interoperability with non-MIME clients and
385 ;; for making it easy to find keywords.
387 (goto-char (point-min))
388 (while (progn (skip-chars-forward " \t\n")
391 (while (and (looking-at "[ \t\n]*\\([^ \t\n]+\\)")
393 (setq end (match-end 0))
394 (re-search-forward encodable-regexp end t)))
396 (if (> (point) start)
397 (rfc2047-encode start (point))
399 ;; `address-mime' case -- take care of quoted words, comments.
400 (rfc2047-quote-special-characters-in-quoted-strings encodable-regexp)
401 (with-syntax-table rfc2047-syntax-table
402 (goto-char (point-min))
403 (condition-case err ; in case of unbalanced quotes
404 ;; Look for rfc2822-style: sequences of atoms, quoted
405 ;; strings, specials, whitespace. (Specials mustn't be
409 (skip-chars-forward " \t\n")
412 ((not (char-after))) ; eob
414 ((eq ?\" (setq csyntax (char-syntax (char-after))))
418 ;; Does it need encoding?
420 (if (re-search-forward encodable-regexp end 'move)
421 ;; It needs encoding. Strip the quotes first,
422 ;; since encoded words can't occur in quotes.
429 ;; There was a preceding quoted word. We need
430 ;; to include any separating whitespace in this
431 ;; word to avoid it getting lost.
432 (skip-chars-backward " \t")
433 ;; A space is needed between the encoded words.
437 ;; Adjust the end position for the deleted quotes.
438 (rfc2047-encode start (- end 2))
439 (setq last-encoded t)) ; record that it was encoded
440 (setq last-encoded nil)))
442 ;; Skip other delimiters, but record that they've
443 ;; potentially separated quoted words.
445 (setq last-encoded nil))
447 (error "Unbalanced parentheses"))
449 ;; Look for the end of parentheses.
451 ;; Encode text as an unstructured field.
452 (let ((rfc2047-encoding-type 'mime))
453 (rfc2047-encode-region (1+ start) (1- (point))))
454 (skip-chars-forward ")"))
455 (t ; normal token/whitespace sequence
457 ;; Skip one ASCII word, or encode continuous words
458 ;; in which all those contain non-ASCII characters.
460 (while (not (or end (eobp)))
461 (when (looking-at "[\000-\177]+")
465 (while (and (or (re-search-forward
466 "[ \t\n]\\|\\Sw" end 'move)
468 (eq ?\\ (char-syntax (char-before))))
469 ;; Skip backslash-quoted characters.
472 (setq end (match-beginning 0))
473 (if rfc2047-encode-encoded-words
476 (when (search-forward "=?" end 'move)
477 (goto-char (match-beginning 0))
480 ;; Where the value nil of `end' means there may be
481 ;; text to have to be encoded following the point.
482 ;; Otherwise, the point reached to the end of ASCII
483 ;; words separated by whitespace or a special char.
485 (when (looking-at encodable-regexp)
486 (goto-char (setq begin (match-end 0)))
487 (while (and (looking-at "[ \t\n]+\\([^ \t\n]+\\)")
488 (setq end (match-end 0))
490 (while (re-search-forward
491 encodable-regexp end t))
494 (or (not (re-search-forward "\\Sw" end t))
496 (goto-char (match-beginning 0))
499 (when (looking-at "[^ \t\n]+")
500 (setq end (match-end 0))
501 (if (re-search-forward "\\Sw+" end t)
502 ;; There are special characters better
503 ;; to be encoded so that MTAs may parse
505 (cond ((= end (point)))
506 ((looking-at (concat "\\sw*\\("
511 (goto-char (1- (match-end 0)))
512 (unless (= (point) (match-beginning 0))
513 ;; Separate encodable text and