1 ;;; spam.el --- Identifying spam
2 ;; Copyright (C) 2002 Free Software Foundation, Inc.
4 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
7 ;; This file is part of GNU Emacs.
9 ;; GNU Emacs is free software; you can redistribute it and/or modify
10 ;; it under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation; either version 2, or (at your option)
14 ;; GNU Emacs is distributed in the hope that it will be useful,
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details.
19 ;; You should have received a copy of the GNU General Public License
20 ;; along with GNU Emacs; see the file COPYING. If not, write to the
21 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 ;; Boston, MA 02111-1307, USA.
26 ;;; This module addresses a few aspects of spam control under Gnus. Page
27 ;;; breaks are used for grouping declarations and documentation relating to
28 ;;; each particular aspect.
30 ;;; The integration with Gnus is not yet complete. See various `FIXME'
31 ;;; comments, below, for supplementary explanations or discussions.
37 ;; FIXME! We should not require `dns' nor `message' until we actually
38 ;; need them. Best would be to declare needed functions as auto-loadable.
44 (defvar spam-use-blacklist t
45 "True if the blacklist should be used.")
47 (defvar spam-use-whitelist t
48 "True if the whitelist should be used.")
50 (defvar spam-use-blackholes nil
51 ;; FIXME! Turned off for now. The DNS routines are said to be flaky.
52 "True if blackholes should be used.")
54 (defvar spam-use-bogofilter t
55 "True if bogofilter should be used.")
57 (defvar spam-split-group "spam"
58 "Usual group name where spam should be split.")
60 (defvar spam-junk-mailgroups
61 ;; FIXME! The mailgroup list evidently depends on other choices made by the
62 ;; user, so the built-in default below is not likely to be appropriate.
63 (cons spam-split-group '("mail.junk" "poste.pourriel"))
64 "Mailgroups which are dedicated by splitting to receive various junk.
65 All unmarked article in such group receive the spam mark on group entry.")
67 ;; FIXME! For `spam-ham-marks' and `spam-spam-marks', I wonder if it would
68 ;; not be easier for the user to just accept a string of mark letters, instead
69 ;; of a list of Gnus variable names. In such case, the stunt of deferred
70 ;; evaluation would not be useful anymore. Lars?? :-)
72 ;; FIXME! It is rather questionable to see `K', `X' and `Y' as indicating
73 ;; positive ham. It much depends on how and why people use kill files, score
74 ;; files, and the kill command. Maybe it would be better, by default, to not
75 ;; process a message neither as ham nor spam, that is, just ignore it for
76 ;; learning purposes, when we are not sure of how the user sees it.
77 ;; But `r' and `R' should undoubtedly be seen as ham.
79 ;; FIXME! Some might consider overkill to define a list of spam marks. On
80 ;; the other hand, who knows, some users might for example like that
81 ;; explicitly `E'xpired articles be processed as positive spam.
83 (defvar spam-ham-marks
84 (list gnus-del-mark gnus-read-mark gnus-killed-mark
85 gnus-kill-file-mark gnus-low-score-mark)
86 "Marks considered as being ham (positively not spam).
87 Such articles will be transmitted to `bogofilter -n' on group exit.")
89 (defvar spam-spam-marks
91 "Marks considered as being spam (positively spam).
92 Such articles will be transmitted to `bogofilter -s' on group exit.")
94 ;; FIXME! Ideally, the remainder of this page should be fully integrated
95 ;; within `gnus-sum.el'.
97 ;;; Key bindings for spam control.
99 ;; FIXME! The justification for `M-d' is that this is what Paul Graham
100 ;; suggests in his original article, and what Eric Raymond's patch for Mutt
101 ;; uses. But more importantly, that binding was still free in Summary mode!
103 ;; FIXME! Lars has not blessed the following key bindings yet. It looks
104 ;; convenient that the score analysis command uses a sequence ending with the
105 ;; letter `t', so it nicely parallels `B t' or `V t'. `M-d' is a kind of
106 ;; "alternate" `d', it is also the sequence suggested in Paul Graham article,
107 ;; and also in Eric Raymond's patch for Mutt. `S x' might be the more
108 ;; official key binding for `M-d'.
110 (gnus-define-keys gnus-summary-mode-map
111 "St" spam-bogofilter-score
112 "Sx" gnus-summary-mark-as-spam
113 "\M-d" gnus-summary-mark-as-spam)
115 ;;; How to highlight a spam summary line.
117 ;; FIXME! Of course, `gnus-splash-face' has another purpose. Maybe a
118 ;; special face should be created, named and used instead, for spam lines.
120 (push '((eq mark gnus-spam-mark) . gnus-splash-face)
121 gnus-summary-highlight)
123 ;;; Hooks dispatching. A bit raw for now.
125 (defun spam-summary-prepare ()
126 (spam-mark-junk-as-spam-routine))
128 (defun spam-summary-prepare-exit ()
129 (spam-bogofilter-register-routine))
131 (add-hook 'gnus-summary-prepare-hook 'spam-summary-prepare)
132 (add-hook 'gnus-summary-prepare-exit-hook 'spam-summary-prepare-exit)
134 (defun spam-mark-junk-as-spam-routine ()
135 (when (member gnus-newsgroup-name spam-junk-mailgroups)
136 (let ((articles gnus-newsgroup-articles)
139 (setq article (pop articles))
140 (when (eq (gnus-summary-article-mark article) gnus-unread-mark)
141 (gnus-summary-mark-article article gnus-spam-mark))))))
143 ;;;; Spam determination.
145 ;; The following list contains pairs associating a parameter variable with a
146 ;; spam checking function. If the parameter variable is true, then the
147 ;; checking function is called, and its value decides what happens. Each
148 ;; individual check may return `nil', `t', or a mailgroup name. The value
149 ;; `nil' means that the check does not yield a decision, and so, that further
150 ;; checks are needed. The value `t' means that the message is definitely not
151 ;; spam, and that further spam checks should be inhibited. Otherwise, a
152 ;; mailgroup name is returned where the mail should go, and further checks are
153 ;; also inhibited. The usual mailgroup name is the value of
154 ;; `spam-split-group', meaning that the message is definitely a spam.
156 (defvar spam-list-of-checks
157 '((spam-use-blacklist . spam-check-blacklist)
158 (spam-use-whitelist . spam-check-whitelist)
159 (spam-use-bbdb . spam-check-bbdb)
160 (spam-use-blackholes . spam-check-blackholes)
161 (spam-use-bogofilter . spam-check-bogofilter)))
164 "Split this message into the `spam' group if it is spam.
165 This function can be used as an entry in `nnmail-split-fancy', for
166 example like this: (: spam-split)
168 See the Info node `(gnus)Fancy Mail Splitting' for more details."
171 (let ((list-of-checks spam-list-of-checks)
173 (while (and list-of-checks (not decision))
174 (let ((pair (pop list-of-checks)))
175 (when (eval (car pair))
176 (setq decision (apply (cdr pair))))))
183 (defvar spam-blackhole-servers '("bl.spamcop.net"
188 "List of blackhole servers.")
190 (defun spam-check-blackholes ()
191 "Check the Receieved headers for blackholed relays."
192 (let ((headers (message-fetch-field "received"))
197 (goto-char (point-min))
198 (while (re-search-forward
199 "\\[\\([0-9]+.[0-9]+.[0-9]+.[0-9]+\\)\\]" nil t)
200 (message "Blackhole search found host IP %s." (match-string 1))
201 (push (mapconcat 'identity
202 (nreverse (split-string (match-string 1) "\\."))
205 (dolist (server spam-blackhole-servers)
207 (when (query-dns (concat ip "." server))
208 (push (list ip server (query-dns (concat ip "." server) 'TXT))
213 ;;;; Blacklists and whitelists.
215 (defvar spam-directory "~/News/spam/"
216 "When spam files are kept.")
218 (defvar spam-whitelist (expand-file-name "whitelist" spam-directory)
219 "The location of the whitelist.
220 The file format is one regular expression per line.
221 The regular expression is matched against the address.")
223 (defvar spam-blacklist (expand-file-name "blacklist" spam-directory)
224 "The location of the blacklist.
225 The file format is one regular expression per line.
226 The regular expression is matched against the address.")
228 (defvar spam-whitelist-cache nil)
229 (defvar spam-blacklist-cache nil)
231 (defun spam-enter-whitelist (address)
232 "Enter ADDRESS into the whitelist."
233 (interactive "sAddress: ")
234 (spam-enter-list address spam-whitelist)
235 (setq spam-whitelist-cache nil))
237 (defun spam-enter-blacklist (address)
238 "Enter ADDRESS into the blacklist."
239 (interactive "sAddress: ")
240 (spam-enter-list address spam-blacklist)
241 (setq spam-blacklist-cache nil))
243 (defun spam-enter-list (address file)
244 "Enter ADDRESS into the given FILE, either the whitelist or the blacklist."
245 (unless (file-exists-p (file-name-directory file))
246 (make-directory (file-name-directory file) t))
249 (find-file-noselect file))
250 (goto-char (point-max))
253 (insert address "\n")
256 ;;; returns nil if the sender is in the whitelist, spam-split-group otherwise
257 (defun spam-check-whitelist ()
258 ;; FIXME! Should it detect when file timestamps change?
259 (unless spam-whitelist-cache
260 (setq spam-whitelist-cache (spam-parse-list spam-whitelist)))
261 (if (spam-from-listed-p spam-whitelist-cache) nil spam-split-group))
263 ;;; copied from code by Alexander Kotelnikov <sacha@giotto.sj.ru>
264 ;; FIXME: assumes that bbdb.el is loaded
265 ;; FIXME: broken right now, if the "from" field can't be retrieved
266 (defun spam-check-bbdb ()
267 "We want messages from people who are in the BBDB not to be split to spam"
268 (let ((who (message-fetch-field "from")))
270 (setq who (regexp-quote (cadr (gnus-extract-address-components who))))
271 (if (bbdb-search (bbdb-records) nil nil who) nil spam-split-group))))
273 (defun spam-check-blacklist ()
274 ;; FIXME! Should it detect when file timestamps change?
275 (unless spam-blacklist-cache
276 (setq spam-blacklist-cache (spam-parse-list spam-blacklist)))
277 (and (spam-from-listed-p spam-blacklist-cache) spam-split-group))
280 (defalias 'spam-point-at-eol (if (fboundp 'point-at-eol)
282 'line-end-position)))
284 (defun spam-parse-list (file)
285 (when (file-readable-p file)
286 (let (contents address)
288 (insert-file-contents file)
290 (setq address (buffer-substring (point) (spam-point-at-eol)))
292 (unless (zerop (length address))
293 (setq address (regexp-quote address))
294 (while (string-match "\\\\\\*" address)
295 (setq address (replace-match ".*" t t address)))
296 (push address contents))))
297 (nreverse contents))))
299 (defun spam-from-listed-p (cache)
300 (let ((from (message-fetch-field "from"))
303 (when (string-match (pop cache) from)
308 ;;;; Training via Bogofilter. Last updated 2002-09-02.
310 ;;; See Paul Graham article, at `http://www.paulgraham.com/spam.html'.
312 ;;; This page is for those wanting to control spam with the help of Eric
313 ;;; Raymond's speedy Bogofilter, see http://www.tuxedo.org/~esr/bogofilter.
314 ;;; This has been tested with a locally patched copy of version 0.4.
316 ;;; Make sure Bogofilter is installed. Bogofilter internally uses Judy fast
317 ;;; associative arrays, so you need to install Judy first, and Bogofilter
318 ;;; next. Fetch both distributions by visiting the following links and
319 ;;; downloading the latest version of each:
321 ;;; http://sourceforge.net/projects/judy/
322 ;;; http://www.tuxedo.org/~esr/bogofilter/
324 ;;; Unpack the Judy distribution and enter its main directory. Then do:
330 ;;; You will likely need to become super-user for the last step. Then, unpack
331 ;;; the Bogofilter distribution and enter its main directory:
336 ;;; Here as well, you need to become super-user for the last step. Now,
337 ;;; initialises your word lists by doing, under your own identity:
339 ;;; mkdir ~/.bogofilter
340 ;;; touch ~/.bogofilter/badlist
341 ;;; touch ~/.bogofilter/goodlist
343 ;;; These two files are text files you may edit, but you normally don't!
345 ;;; The `M-d' command gets added to Gnus summary mode, marking current article
346 ;;; as spam, showing it with the `H' mark. Whenever you see a spam article,
347 ;;; make sure to mark its summary line with `M-d' before leaving the group.
348 ;;; Some groups, as per variable `spam-junk-mailgroups' below, receive articles
349 ;;; from Gnus splitting on clues added by spam recognisers, so for these
350 ;;; groups, we tack an `H' mark at group entry for all summary lines which
351 ;;; would otherwise have no other mark. Make sure to _remove_ `H' marks for
352 ;;; any article which is _not_ genuine spam, before leaving such groups: you
353 ;;; may use `M-u' to "unread" the article, or `d' for declaring it read the
354 ;;; non-spam way. When you leave a group, all `H' marked articles, saved or
355 ;;; unsaved, are sent to Bogofilter which will study them as spam samples.
357 ;;; Messages may also be deleted in various other ways, and unless
358 ;;; `spam-ham-marks-form' gets overridden below, marks `R' and `r' for default
359 ;;; read or explicit delete, marks `X' and 'K' for automatic or explicit
360 ;;; kills, as well as mark `Y' for low scores, are all considered to be
361 ;;; associated with articles which are not spam. This assumption might be
362 ;;; false, in particular if you use kill files or score files as means for
363 ;;; detecting genuine spam, you should then adjust `spam-ham-marks-form'. When
364 ;;; you leave a group, all _unsaved_ articles bearing any the above marks are
365 ;;; sent to Bogofilter which will study these as not-spam samples. If you
366 ;;; explicit kill a lot, you might sometimes end up with articles marked `K'
367 ;;; which you never saw, and which might accidentally contain spam. Best is
368 ;;; to make sure that real spam is marked with `H', and nothing else.
370 ;;; All other marks do not contribute to Bogofilter pre-conditioning. In
371 ;;; particular, ticked, dormant or souped articles are likely to contribute
372 ;;; later, when they will get deleted for real, so there is no need to use
373 ;;; them prematurely. Explicitly expired articles do not contribute, command
374 ;;; `E' is a way to get rid of an article without Bogofilter ever seeing it.
376 ;;; In a word, with a minimum of care for associating the `H' mark for spam
377 ;;; articles only, Bogofilter training all gets fairly automatic. You should
378 ;;; do this until you get a few hundreds of articles in each category, spam
379 ;;; or not. The shell command `head -1 ~/.bogofilter/*' shows both article
380 ;;; counts. The command `S S' in summary mode, either for debugging or for
381 ;;; curiosity, triggers Bogofilter into displaying in another buffer the
382 ;;; "spamicity" score of the current article (between 0.0 and 1.0), together
383 ;;; with the article words which most significantly contribute to the score.
385 ;;; The real way for using Bogofilter, however, is to have some use tool like
386 ;;; `procmail' for invoking it on message reception, then adding some
387 ;;; recognisable header in case of detected spam. Gnus splitting rules might
388 ;;; later trip on these added headers and react by sorting such articles into
389 ;;; specific junk folders as per `spam-junk-mailgroups'. Here is a possible
390 ;;; `.procmailrc' contents (still untested -- please tell me how it goes):
394 ;;; | formail -bfI "X-Spam-Status: Yes"
396 (defvar spam-output-buffer-name "*Bogofilter Output*"
397 "Name of buffer when displaying `bogofilter -v' output.")
399 (defvar spam-spaminfo-header-regexp
400 ;; FIXME! In the following regexp, we should explain which tool produces
401 ;; which kind of header. I do not even remember them all by now. X-Junk
402 ;; (and previously X-NoSpam) are produced by the `NoSpam' tool, which has
403 ;; never been published, so it might not be reasonable leaving it in the
405 "^X-\\(jf\\|Junk\\|NoSpam\\|Spam\\|SB\\)[^:]*:"
406 "Regexp for spam markups in headers.
407 Markup from spam recognisers, as well as `Xref', are to be removed from
408 articles before they get registered by Bogofilter.")
410 ;; FIXME! I do not know if Gnus has a compatibility function for
411 ;; `executable-find'. Here is a possible mantra for portability,
412 ;; until Lars decides how we really should do it.
413 (unless (fboundp 'executable-find)
414 (if (fboundp 'locate-file)
415 (defun executable-find (command)
416 (locate-file command exec-path))
417 (autoload 'executable-find "executable")))
418 ;; End of portability mantra for `executable-find'.
420 (defvar spam-bogofilter-path (executable-find "bogofilter")
421 "File path of the Bogofilter executable program.
422 Force this variable to nil if you want to inhibit the functionality.")
424 (defun spam-check-bogofilter ()
425 ;; Dynamic spam check. I do not know how to check the exit status,
426 ;; so instead, read `bogofilter -v' output.
427 (when (and spam-use-bogofilter spam-bogofilter-path)
428 (spam-bogofilter-articles nil "-v" (list (gnus-summary-article-number)))
429 (when (save-excursion
430 (set-buffer spam-output-buffer-name)
431 (goto-char (point-min))
432 (re-search-forward "Spamicity: \\(0\\.9\\|1\\.0\\)" nil t))
435 (defun spam-bogofilter-score ()
436 "Use `bogofilter -v' on the current article.
437 This yields the 15 most discriminant words for this article and the
438 spamicity coefficient of each, and the overall article spamicity."
440 (when (and spam-use-bogofilter spam-bogofilter-path)
441 (spam-bogofilter-articles nil "-v" (list (gnus-summary-article-number)))
443 (set-buffer spam-output-buffer-name)
444 (unless (= (point-min) (point-max))
445 (display-message-or-buffer (current-buffer)
446 spam-output-buffer-name)))))
448 (defun spam-bogofilter-register-routine ()
449 (when (and spam-use-bogofilter spam-bogofilter-path)
450 (let ((articles gnus-newsgroup-articles)
451 article mark ham-articles spam-articles)
453 (setq article (pop articles)
454 mark (gnus-summary-article-mark article))
455 (cond ((memq mark spam-spam-marks) (push article spam-articles))
456 ((memq article gnus-newsgroup-saved))
457 ((memq mark spam-ham-marks) (push article ham-articles))))
459 (spam-bogofilter-articles "ham" "-n" ham-articles))
461 (spam-bogofilter-articles "SPAM" "-s" spam-articles)))))
463 (defvar spam-bogofilter-initial-timeout 40
464 "Timeout in seconds for the initial reply from the `bogofilter' program.")
466 (defvar spam-bogofilter-subsequent-timeout 15
467 "Timeout in seconds for any subsequent reply from the `bogofilter' program.")
469 (defun spam-bogofilter-articles (type option articles)
470 (let ((output-buffer (get-buffer-create spam-output-buffer-name))
471 (article-copy (get-buffer-create " *Bogofilter Article Copy*"))
472 (remove-regexp (concat spam-spaminfo-header-regexp "\\|Xref:"))
474 prefix process article)
476 (setq prefix (format "Studying %d articles as %s..." (length articles)
478 (message "%s" prefix))
479 (save-excursion (set-buffer output-buffer) (erase-buffer))
480 (setq process (start-process "bogofilter" output-buffer
481 spam-bogofilter-path "-F" option))
482 (process-kill-without-query process t)
484 (save-window-excursion
486 (setq counter (1+ counter))
488 (message "%s %d" prefix counter))
489 (setq article (pop articles))
490 (gnus-summary-goto-subject article)
491 (gnus-summary-select-article)
492 (gnus-eval-in-buffer-window article-copy
493 (insert-buffer-substring gnus-original-article-buffer)
494 ;; Remove spam classification redundant headers: they may induce
495 ;; unwanted biases in later analysis.
496 (goto-char (point-min))
497 (while (not (or (eobp) (= (following-char) ?\n)))
498 (if (looking-at remove-regexp)
499 (delete-region (point)
500 (save-excursion (forward-line 1) (point)))
502 (goto-char (point-min))
503 ;; Bogofilter really wants From envelopes for counting articles.
504 ;; Fake one at the beginning, make sure there will be no other.
505 (if (looking-at "From ")
507 (insert "From nobody " (current-time-string) "\n"))
508 (let (case-fold-search)
509 (while (re-search-forward "^From " nil t)
512 (process-send-region process (point-min) (point-max))
514 ;; Sending the EOF is unwind-protected. This is to prevent lost copies
515 ;; of `bogofilter', hung on reading their standard input, in case the
516 ;; whole registering process gets interrupted by the user.
517 (process-send-eof process))
518 (kill-buffer article-copy)
519 ;; Receive process output. It sadly seems that we still have to protect
520 ;; ourselves against hung `bogofilter' processes.
521 (let ((status (process-status process))
522 (timeout (* 1000 spam-bogofilter-initial-timeout))
523 (quanta 200)) ; also counted in milliseconds
524 (while (and (not (eq status 'exit)) (> timeout 0))
525 ;; `accept-process-output' timeout is counted in microseconds.
526 (setq timeout (if (accept-process-output process 0 (* 1000 quanta))
527 (* 1000 spam-bogofilter-subsequent-timeout)
529 status (process-status process)))
530 (if (eq status 'exit)
532 (message "%s done!" prefix))
533 ;; Sigh! The process did time out... Become brutal!
534 (interrupt-process process)
535 (message "%s %d INTERRUPTED! (Article %d, status %s)"
536 (or prefix "Bogofilter process...")
537 counter article status)
538 ;; Give some time for user to read. Sitting redisplays but gives up
539 ;; if input is pending. Sleeping does not give up, but it does not
540 ;; redisplay either. Mix both: let's redisplay and not give up.
546 ;;; spam.el ends here.