1 ;;; spam.el --- Identifying spam
2 ;; Copyright (C) 2002 Free Software Foundation, Inc.
4 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
7 ;; This file is part of GNU Emacs.
9 ;; GNU Emacs is free software; you can redistribute it and/or modify
10 ;; it under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation; either version 2, or (at your option)
14 ;; GNU Emacs is distributed in the hope that it will be useful,
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details.
19 ;; You should have received a copy of the GNU General Public License
20 ;; along with GNU Emacs; see the file COPYING. If not, write to the
21 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 ;; Boston, MA 02111-1307, USA.
26 ;;; This module addresses a few aspects of spam control under Gnus. Page
27 ;;; breaks are used for grouping declarations and documentation relating to
28 ;;; each particular aspect.
30 ;;; The integration with Gnus is not yet complete. See various `FIXME'
31 ;;; comments, below, for supplementary explanations or discussions.
33 ;;; Several TODO items are marked as such
39 (require 'gnus-uu) ; because of key prefix issues
40 (require 'gnus) ; for the definitions of group content classification and spam processors
42 ;; FIXME! We should not require `message' until we actually need
43 ;; them. Best would be to declare needed functions as auto-loadable.
46 ;; Attempt to load BBDB macros
50 (file-error (defalias 'bbdb-search 'ignore))))
52 ;; autoload executable-find
54 ;; executable-find is not autoloaded in Emacs 20
55 (autoload 'executable-find "executable"))
57 ;; autoload ifile-spam-filter
59 (autoload 'ifile-spam-filter "ifile-gnus"))
63 (autoload 'query-dig "dig"))
67 (autoload 'query-dns "dns"))
72 "Spam configuration.")
74 (defcustom spam-directory "~/News/spam/"
75 "Directory for spam whitelists and blacklists."
79 (defcustom spam-whitelist (expand-file-name "whitelist" spam-directory)
80 "The location of the whitelist.
81 The file format is one regular expression per line.
82 The regular expression is matched against the address."
86 (defcustom spam-blacklist (expand-file-name "blacklist" spam-directory)
87 "The location of the blacklist.
88 The file format is one regular expression per line.
89 The regular expression is matched against the address."
93 (defcustom spam-use-dig t
94 "Whether query-dig should be used instead of query-dns."
98 (defcustom spam-use-blacklist nil
99 "Whether the blacklist should be used by spam-split."
103 (defcustom spam-use-whitelist nil
104 "Whether the whitelist should be used by spam-split."
108 (defcustom spam-use-blackholes nil
109 "Whether blackholes should be used by spam-split."
113 (defcustom spam-use-bogofilter nil
114 "Whether bogofilter should be used by spam-split."
118 (defcustom spam-use-BBDB nil
119 "Whether BBDB should be used by spam-split."
123 (defcustom spam-use-ifile nil
124 "Whether ifile should be used by spam-split."
128 (defcustom spam-split-group "spam"
129 "Group name where incoming spam should be put by spam-split."
133 ;; FIXME! The mailgroup list evidently depends on other choices made by the
134 ;; user, so the built-in default below is not likely to be appropriate.
135 (defcustom spam-junk-mailgroups (cons spam-split-group '("mail.junk" "poste.pourriel"))
136 "Mailgroups with spam contents.
137 All unmarked article in such group receive the spam mark on group entry."
138 :type '(repeat (string :tag "Group"))
141 (defcustom spam-blackhole-servers '("bl.spamcop.net" "relays.ordb.org" "dev.null.dk" "relays.visi.com")
142 "List of blackhole servers."
143 :type '(repeat (string :tag "Server"))
146 (defcustom spam-ham-marks (list 'gnus-del-mark 'gnus-read-mark 'gnus-killed-mark 'gnus-kill-file-mark 'gnus-low-score-mark)
147 "Marks considered as being ham (positively not spam).
148 Such articles will be processed as ham (non-spam) on group exit."
150 (variable-item gnus-del-mark)
151 (variable-item gnus-read-mark)
152 (variable-item gnus-killed-mark)
153 (variable-item gnus-kill-file-mark)
154 (variable-item gnus-low-score-mark))
157 (defcustom spam-spam-marks (list 'gnus-spam-mark)
158 "Marks considered as being spam (positively spam).
159 Such articles will be transmitted to `bogofilter -s' on group exit."
161 (variable-item gnus-spam-mark)
162 (variable-item gnus-killed-mark)
163 (variable-item gnus-kill-file-mark)
164 (variable-item gnus-low-score-mark))
167 (defcustom spam-face 'gnus-splash-face
168 "Face for spam-marked articles"
172 (defgroup spam-bogofilter nil
173 "Spam bogofilter configuration."
176 (defcustom spam-bogofilter-output-buffer-name "*Bogofilter Output*"
177 "Name of buffer when displaying `bogofilter -v' output."
179 :group 'spam-bogofilter)
181 (defcustom spam-bogofilter-initial-timeout 40
182 "Timeout in seconds for the initial reply from the `bogofilter' program."
184 :group 'spam-bogofilter)
186 (defcustom spam-bogofilter-subsequent-timeout 15
187 "Timeout in seconds for any subsequent reply from the `bogofilter' program."
189 :group 'spam-bogofilter)
191 (defcustom spam-bogofilter-path (executable-find "bogofilter")
192 "File path of the Bogofilter executable program."
193 :type '(choice (file :tag "Location of bogofilter")
194 (const :tag "Bogofilter is not installed"))
195 :group 'spam-bogofilter)
197 ;; FIXME! In the following regexp, we should explain which tool produces
198 ;; which kind of header. I do not even remember them all by now. X-Junk
199 ;; (and previously X-NoSpam) are produced by the `NoSpam' tool, which has
200 ;; never been published, so it might not be reasonable leaving it in the
202 (defcustom spam-bogofilter-spaminfo-header-regexp "^X-\\(jf\\|Junk\\|NoSpam\\|Spam\\|SB\\)[^:]*:"
203 "Regexp for spam markups in headers.
204 Markup from spam recognisers, as well as `Xref', are to be removed from
205 articles before they get registered by Bogofilter."
207 :group 'spam-bogofilter)
209 ;;; Key bindings for spam control.
211 (gnus-define-keys gnus-summary-mode-map
212 "St" spam-bogofilter-score
213 "Sx" gnus-summary-mark-as-spam
214 "Mst" spam-bogofilter-score
215 "Msx" gnus-summary-mark-as-spam
216 "\M-d" gnus-summary-mark-as-spam)
218 ;;; How to highlight a spam summary line.
220 ;; TODO: How do we redo this every time spam-face is customized?
222 (push '((eq mark gnus-spam-mark) . spam-face)
223 gnus-summary-highlight)
225 ;; convenience functions
226 (defun spam-group-spam-contents-p (group)
228 (or (member group spam-junk-mailgroups)
229 (memq 'gnus-group-spam-classification-spam (gnus-parameter-spam-contents group)))
232 (defun spam-group-ham-contents-p (group)
234 (memq 'gnus-group-spam-classification-ham (gnus-parameter-spam-contents group))
237 (defun spam-group-processor-p (group processor)
238 (if (and (stringp group)
240 (member processor (car (gnus-parameter-spam-process group)))
243 (defun spam-group-processor-bogofilter-p (group)
244 (spam-group-processor-p group 'gnus-group-spam-exit-processor-bogofilter))
246 (defun spam-group-processor-ifile-p (group)
247 (spam-group-processor-p group 'gnus-group-spam-exit-processor-ifile))
249 (defun spam-group-processor-blacklist-p (group)
250 (spam-group-processor-p group 'gnus-group-spam-exit-processor-blacklist))
252 (defun spam-group-processor-whitelist-p (group)
253 (spam-group-processor-p group 'gnus-group-ham-exit-processor-whitelist))
255 (defun spam-group-processor-BBDB-p (group)
256 (spam-group-processor-p group 'gnus-group-ham-exit-processor-BBDB))
258 ;;; Hooks dispatching. A bit raw for now.
260 (defun spam-summary-prepare ()
261 (spam-mark-junk-as-spam-routine))
263 (defun spam-summary-prepare-exit ()
264 ;; The spam processors are invoked for any group, spam or ham or neither
265 (when (and spam-bogofilter-path
266 (spam-group-processor-bogofilter-p gnus-newsgroup-name))
267 (spam-bogofilter-register-routine))
269 (when (spam-group-processor-ifile-p gnus-newsgroup-name)
270 (spam-ifile-register-routine))
272 (when (spam-group-processor-bogofilter-p gnus-newsgroup-name)
273 (spam-blacklist-register-routine))
275 ;; Only for spam groups, we expire and maybe move articles
276 (when (spam-group-spam-contents-p gnus-newsgroup-name)
277 (spam-mark-spam-as-expired-and-move-routine (gnus-parameter-spam-process-destination gnus-newsgroup-name)))
279 (when (spam-group-ham-contents-p gnus-newsgroup-name)
280 (when (spam-group-processor-whitelist-p gnus-newsgroup-name)
281 (spam-whitelist-register-routine))
282 (when (spam-group-processor-BBDB-p gnus-newsgroup-name)
283 (spam-BBDB-register-routine))))
285 (add-hook 'gnus-summary-prepare-hook 'spam-summary-prepare)
286 (add-hook 'gnus-summary-prepare-exit-hook 'spam-summary-prepare-exit)
288 (defun spam-mark-junk-as-spam-routine ()
289 ;; check the global list of group names spam-junk-mailgroups and the group parameters
290 (when (spam-group-spam-contents-p gnus-newsgroup-name)
291 (let ((articles gnus-newsgroup-articles)
294 (setq article (pop articles))
295 (when (eq (gnus-summary-article-mark article) gnus-unread-mark)
296 (gnus-summary-mark-article article gnus-spam-mark))))))
298 (defun spam-mark-spam-as-expired-and-move-routine (&optional group)
299 (let ((articles gnus-newsgroup-articles)
302 (setq article (pop articles))
303 (when (eq (gnus-summary-article-mark article) gnus-spam-mark)
304 (gnus-summary-mark-article article gnus-expirable-mark)
305 (when (stringp group)
306 (let ((gnus-current-article article))
307 (gnus-summary-move-article nil group)))))))
309 (defun spam-generic-register-routine (spam-func ham-func)
310 (let ((articles gnus-newsgroup-articles)
311 article mark ham-articles spam-articles spam-mark-values ham-mark-values)
313 ;; marks are stored as symbolic values, so we have to dereference them for memq to work
314 ;; we wouldn't have to do this if gnus-summary-article-mark returned a symbol.
315 (dolist (mark spam-ham-marks)
316 (push (symbol-value mark) ham-mark-values))
318 (dolist (mark spam-spam-marks)
319 (push (symbol-value mark) spam-mark-values))
322 (setq article (pop articles)
323 mark (gnus-summary-article-mark article))
324 (cond ((memq mark spam-mark-values) (push article spam-articles))
325 ((memq article gnus-newsgroup-saved))
326 ((memq mark ham-mark-values) (push article ham-articles))))
327 (when (and ham-articles ham-func)
328 (mapc ham-func ham-articles)) ; we use mapc because unlike mapcar it discards the return values
329 (when (and spam-articles spam-func)
330 (mapc spam-func spam-articles)))) ; we use mapc because unlike mapcar it discards the return values
332 (defun spam-fetch-field-from-fast (article)
333 "Fetch the `from' field quickly, using the Gnus internal gnus-data-list function"
334 (if (and (numberp article)
335 (assoc article (gnus-data-list nil)))
336 (mail-header-from (gnus-data-header (assoc article (gnus-data-list nil))))
340 ;;;; Spam determination.
342 (defvar spam-list-of-checks
343 '((spam-use-blacklist . spam-check-blacklist)
344 (spam-use-whitelist . spam-check-whitelist)
345 (spam-use-BBDB . spam-check-BBDB)
346 (spam-use-ifile . spam-check-ifile)
347 (spam-use-blackholes . spam-check-blackholes)
348 (spam-use-bogofilter . spam-check-bogofilter))
349 "The spam-list-of-checks list contains pairs associating a parameter
350 variable with a spam checking function. If the parameter variable is
351 true, then the checking function is called, and its value decides what
352 happens. Each individual check may return `nil', `t', or a mailgroup
353 name. The value `nil' means that the check does not yield a decision,
354 and so, that further checks are needed. The value `t' means that the
355 message is definitely not spam, and that further spam checks should be
356 inhibited. Otherwise, a mailgroup name is returned where the mail
357 should go, and further checks are also inhibited. The usual mailgroup
358 name is the value of `spam-split-group', meaning that the message is
362 "Split this message into the `spam' group if it is spam.
363 This function can be used as an entry in `nnmail-split-fancy', for
364 example like this: (: spam-split)
366 See the Info node `(gnus)Fancy Mail Splitting' for more details."
369 (let ((list-of-checks spam-list-of-checks)
371 (while (and list-of-checks (not decision))
372 (let ((pair (pop list-of-checks)))
373 (when (symbol-value (car pair))
374 (setq decision (funcall (cdr pair))))))
381 (defun spam-check-blackholes ()
382 "Check the Received headers for blackholed relays."
383 (let ((headers (message-fetch-field "received"))
388 (goto-char (point-min))
389 (while (re-search-forward
390 "\\[\\([0-9]+.[0-9]+.[0-9]+.[0-9]+\\)\\]" nil t)
391 (message "Blackhole search found host IP %s." (match-string 1))
392 (push (mapconcat 'identity
393 (nreverse (split-string (match-string 1) "\\."))
396 (dolist (server spam-blackhole-servers)
398 (let ((query-string (concat ip "." server)))
400 (let ((query-result (query-dig query-string)))
402 (message "spam detected with blackhole check of relay %s (dig query result '%s')" query-string query-result)
403 (push (list ip server query-result)
405 ;; else, if not using dig.el
406 (when (query-dns query-string)
407 (push (list ip server (query-dns query-string 'TXT))
413 ;;; original idea for spam-check-BBDB from Alexander Kotelnikov <sacha@giotto.sj.ru>
415 ;; all this is done inside a condition-case to trap errors
421 (defun spam-enter-ham-BBDB (from)
422 "Enter an address into the BBDB; implies ham (non-spam) sender"
424 (let* ((parsed-address (gnus-extract-address-components from))
425 (name (or (car parsed-address) "Ham Sender"))
426 (net-address (car (cdr parsed-address))))
427 (message "Adding address %s to BBDB" from)
428 (when (and net-address
429 (not (bbdb-search (bbdb-records) nil nil net-address)))
430 (bbdb-create-internal name nil net-address nil nil "ham sender added by spam.el")))))
432 (defun spam-BBDB-register-routine ()
433 (spam-generic-register-routine
438 (spam-enter-ham-BBDB (spam-fetch-field-from-fast article)))))
440 (defun spam-check-BBDB ()
441 "Mail from people in the BBDB is never considered spam"
442 (let ((who (message-fetch-field "from")))
444 (setq who (regexp-quote (cadr (gnus-extract-address-components who))))
445 (if (bbdb-search (bbdb-records) nil nil who) nil spam-split-group)))))
448 (setq spam-list-of-checks
449 (delete (assoc 'spam-use-BBDB spam-list-of-checks)
450 spam-list-of-checks))
451 (defun spam-check-BBDB ()
452 message "spam-check-BBDB was invoked, but it shouldn't have")
453 (defun spam-BBDB-register-routine ()
454 (spam-generic-register-routine nil nil)))))
458 ;;; uses ifile-gnus.el from http://www.ai.mit.edu/people/jhbrown/ifile-gnus.html
459 ;;; check the ifile backend; return nil if the mail was NOT classified as spam
460 ;;; TODO: we can't (require 'ifile-gnus), because it will insinuate itself automatically
461 (defun spam-check-ifile ()
462 (let ((ifile-primary-spam-group spam-split-group))
463 (ifile-spam-filter nil)))
465 ;; TODO: add ifile registration
466 ;; We need ifile-gnus.el to support nnimap; we could feel the message
467 ;; directly to ifile like we do with bogofilter but that's ugly.
468 (defun spam-ifile-register-routine ()
469 (spam-generic-register-routine nil nil))
472 ;;;; Blacklists and whitelists.
474 (defvar spam-whitelist-cache nil)
475 (defvar spam-blacklist-cache nil)
477 (defun spam-enter-whitelist (address)
478 "Enter ADDRESS into the whitelist."
479 (interactive "sAddress: ")
480 (spam-enter-list address spam-whitelist)
481 (setq spam-whitelist-cache nil))
483 (defun spam-enter-blacklist (address)
484 "Enter ADDRESS into the blacklist."
485 (interactive "sAddress: ")
486 (spam-enter-list address spam-blacklist)
487 (setq spam-blacklist-cache nil))
489 (defun spam-enter-list (address file)
490 "Enter ADDRESS into the given FILE, either the whitelist or the blacklist."
491 (unless (file-exists-p (file-name-directory file))
492 (make-directory (file-name-directory file) t))
495 (find-file-noselect file))
496 (goto-char (point-max))
499 (insert address "\n")
502 ;;; returns nil if the sender is in the whitelist, spam-split-group otherwise
503 (defun spam-check-whitelist ()
504 ;; FIXME! Should it detect when file timestamps change?
505 (unless spam-whitelist-cache
506 (setq spam-whitelist-cache (spam-parse-list spam-whitelist)))
507 (if (spam-from-listed-p spam-whitelist-cache) nil spam-split-group))
509 (defun spam-check-blacklist ()
510 ;; FIXME! Should it detect when file timestamps change?
511 (unless spam-blacklist-cache
512 (setq spam-blacklist-cache (spam-parse-list spam-blacklist)))
513 (and (spam-from-listed-p spam-blacklist-cache) spam-split-group))
516 (defalias 'spam-point-at-eol (if (fboundp 'point-at-eol)
518 'line-end-position)))
520 (defun spam-parse-list (file)
521 (when (file-readable-p file)
522 (let (contents address)
524 (insert-file-contents file)
526 (setq address (buffer-substring (point) (spam-point-at-eol)))
528 (unless (zerop (length address))
529 (setq address (regexp-quote address))
530 (while (string-match "\\\\\\*" address)
531 (setq address (replace-match ".*" t t address)))
532 (push address contents))))
533 (nreverse contents))))
535 (defun spam-from-listed-p (cache)
536 (let ((from (message-fetch-field "from"))
539 (when (string-match (pop cache) from)
544 (defun spam-blacklist-register-routine ()
545 (spam-generic-register-routine
548 (let ((from (spam-fetch-field-from-fast article)))
550 (spam-enter-blacklist from))))
554 (defun spam-whitelist-register-routine ()
555 (spam-generic-register-routine
560 (let ((from (spam-fetch-field-from-fast article)))
562 (spam-enter-whitelist from))))))
567 ;;; See Paul Graham article, at `http://www.paulgraham.com/spam.html'.
569 ;;; This page is for those wanting to control spam with the help of Eric
570 ;;; Raymond's speedy Bogofilter, see http://www.tuxedo.org/~esr/bogofilter.
571 ;;; This has been tested with a locally patched copy of version 0.4.
573 ;;; Make sure Bogofilter is installed. Bogofilter internally uses Judy fast
574 ;;; associative arrays, so you need to install Judy first, and Bogofilter
575 ;;; next. Fetch both distributions by visiting the following links and
576 ;;; downloading the latest version of each:
578 ;;; http://sourceforge.net/projects/judy/
579 ;;; http://www.tuxedo.org/~esr/bogofilter/
581 ;;; Unpack the Judy distribution and enter its main directory. Then do:
587 ;;; You will likely need to become super-user for the last step. Then, unpack
588 ;;; the Bogofilter distribution and enter its main directory:
593 ;;; Here as well, you need to become super-user for the last step. Now,
594 ;;; initialize your word lists by doing, under your own identity:
596 ;;; mkdir ~/.bogofilter
597 ;;; touch ~/.bogofilter/badlist
598 ;;; touch ~/.bogofilter/goodlist
600 ;;; These two files are text files you may edit, but you normally don't!
602 ;;; The `M-d' command gets added to Gnus summary mode, marking current article
603 ;;; as spam, showing it with the `H' mark. Whenever you see a spam article,
604 ;;; make sure to mark its summary line with `M-d' before leaving the group.
605 ;;; Some groups, as per variable `spam-junk-mailgroups' below, receive articles
606 ;;; from Gnus splitting on clues added by spam recognisers, so for these
607 ;;; groups, we tack an `H' mark at group entry for all summary lines which
608 ;;; would otherwise have no other mark. Make sure to _remove_ `H' marks for
609 ;;; any article which is _not_ genuine spam, before leaving such groups: you
610 ;;; may use `M-u' to "unread" the article, or `d' for declaring it read the
611 ;;; non-spam way. When you leave a group, all `H' marked articles, saved or
612 ;;; unsaved, are sent to Bogofilter which will study them as spam samples.
614 ;;; Messages may also be deleted in various other ways, and unless
615 ;;; `spam-ham-marks-form' gets overridden below, marks `R' and `r' for default
616 ;;; read or explicit delete, marks `X' and 'K' for automatic or explicit
617 ;;; kills, as well as mark `Y' for low scores, are all considered to be
618 ;;; associated with articles which are not spam. This assumption might be
619 ;;; false, in particular if you use kill files or score files as means for
620 ;;; detecting genuine spam, you should then adjust `spam-ham-marks-form'. When
621 ;;; you leave a group, all _unsaved_ articles bearing any the above marks are
622 ;;; sent to Bogofilter which will study these as not-spam samples. If you
623 ;;; explicit kill a lot, you might sometimes end up with articles marked `K'
624 ;;; which you never saw, and which might accidentally contain spam. Best is
625 ;;; to make sure that real spam is marked with `H', and nothing else.
627 ;;; All other marks do not contribute to Bogofilter pre-conditioning. In
628 ;;; particular, ticked, dormant or souped articles are likely to contribute
629 ;;; later, when they will get deleted for real, so there is no need to use
630 ;;; them prematurely. Explicitly expired articles do not contribute, command
631 ;;; `E' is a way to get rid of an article without Bogofilter ever seeing it.
633 ;;; In a word, with a minimum of care for associating the `H' mark for spam
634 ;;; articles only, Bogofilter training all gets fairly automatic. You should
635 ;;; do this until you get a few hundreds of articles in each category, spam
636 ;;; or not. The shell command `head -1 ~/.bogofilter/*' shows both article
637 ;;; counts. The command `S S' in summary mode, either for debugging or for
638 ;;; curiosity, triggers Bogofilter into displaying in another buffer the
639 ;;; "spamicity" score of the current article (between 0.0 and 1.0), together
640 ;;; with the article words which most significantly contribute to the score.
642 ;;; The real way for using Bogofilter, however, is to have some use tool like
643 ;;; `procmail' for invoking it on message reception, then adding some
644 ;;; recognisable header in case of detected spam. Gnus splitting rules might
645 ;;; later trip on these added headers and react by sorting such articles into
646 ;;; specific junk folders as per `spam-junk-mailgroups'. Here is a possible
647 ;;; `.procmailrc' contents (still untested -- please tell me how it goes):
651 ;;; | formail -bfI "X-Spam-Status: Yes"
653 (defun spam-check-bogofilter ()
654 ;; Dynamic spam check. I do not know how to check the exit status,
655 ;; so instead, read `bogofilter -v' output.
656 (when (and spam-use-bogofilter spam-bogofilter-path)
657 (spam-bogofilter-articles nil "-v" (list (gnus-summary-article-number)))
658 (when (save-excursion
659 (set-buffer spam-bogofilter-output-buffer-name)
660 (goto-char (point-min))
661 (re-search-forward "Spamicity: \\(0\\.9\\|1\\.0\\)" nil t))
664 (defun spam-bogofilter-score ()
665 "Use `bogofilter -v' on the current article.
666 This yields the 15 most discriminant words for this article and the
667 spamicity coefficient of each, and the overall article spamicity."
669 (when (and spam-use-bogofilter spam-bogofilter-path)
670 (spam-bogofilter-articles nil "-v" (list (gnus-summary-article-number)))
671 (with-current-buffer spam-bogofilter-output-buffer-name
672 (unless (zerop (buffer-size))
673 (if (<= (count-lines (point-min) (point-max)) 1)
675 (goto-char (point-max))
678 (message "%s" (buffer-substring (point-min) (point))))
679 (goto-char (point-min))
680 (display-buffer (current-buffer)))))))
682 (defun spam-bogofilter-register-routine ()
683 (let ((articles gnus-newsgroup-articles)
684 article mark ham-articles spam-articles spam-mark-values ham-mark-values)
686 ;; marks are stored as symbolic values, so we have to dereference them for memq to work
687 ;; we wouldn't have to do this if gnus-summary-article-mark returned a symbol.
688 (dolist (mark spam-ham-marks)
689 (push (symbol-value mark) ham-mark-values))
691 (dolist (mark spam-spam-marks)
692 (push (symbol-value mark) spam-mark-values))
695 (setq article (pop articles)
696 mark (gnus-summary-article-mark article))
697 (cond ((memq mark spam-mark-values) (push article spam-articles))
698 ((memq article gnus-newsgroup-saved))
699 ((memq mark ham-mark-values) (push article ham-articles))))
701 (spam-bogofilter-articles "ham" "-n" ham-articles))
703 (spam-bogofilter-articles "SPAM" "-s" spam-articles))))
705 (defun spam-bogofilter-articles (type option articles)
706 (let ((output-buffer (get-buffer-create spam-bogofilter-output-buffer-name))
707 (article-copy (get-buffer-create " *Bogofilter Article Copy*"))
708 (remove-regexp (concat spam-bogofilter-spaminfo-header-regexp "\\|Xref:"))
710 prefix process article)
712 (setq prefix (format "Studying %d articles as %s..." (length articles)
714 (message "%s" prefix))
715 (save-excursion (set-buffer output-buffer) (erase-buffer))
716 (setq process (start-process "bogofilter" output-buffer
717 spam-bogofilter-path "-F" option))
718 (process-kill-without-query process t)
720 (save-window-excursion
722 (setq counter (1+ counter))
724 (message "%s %d" prefix counter))
725 (setq article (pop articles))
726 (gnus-summary-goto-subject article)
727 (gnus-summary-show-article t)
728 (gnus-eval-in-buffer-window article-copy
729 (insert-buffer-substring gnus-original-article-buffer)
730 ;; Remove spam classification redundant headers: they may induce
731 ;; unwanted biases in later analysis.
732 (message-remove-header remove-regexp t)
733 ;; Bogofilter really wants From envelopes for counting articles.
734 ;; Fake one at the beginning, make sure there will be no other.
735 (goto-char (point-min))
736 (if (looking-at "From ")
738 (insert "From nobody " (current-time-string) "\n"))
739 (let (case-fold-search)
740 (while (re-search-forward "^From " nil t)
743 (process-send-region process (point-min) (point-max))
745 ;; Sending the EOF is unwind-protected. This is to prevent lost copies
746 ;; of `bogofilter', hung on reading their standard input, in case the
747 ;; whole registering process gets interrupted by the user.
748 (process-send-eof process))
749 (kill-buffer article-copy)
750 ;; Receive process output. It sadly seems that we still have to protect
751 ;; ourselves against hung `bogofilter' processes.
752 (let ((status (process-status process))
753 (timeout (* 1000 spam-bogofilter-initial-timeout))
754 (quanta 200)) ; also counted in milliseconds
755 (while (and (not (eq status 'exit)) (> timeout 0))
756 ;; `accept-process-output' timeout is counted in microseconds.
757 (setq timeout (if (accept-process-output process 0 (* 1000 quanta))
758 (* 1000 spam-bogofilter-subsequent-timeout)
760 status (process-status process)))
761 (if (eq status 'exit)
763 (message "%s done!" prefix))
764 ;; Sigh! The process did time out... Become brutal!
765 (interrupt-process process)
766 (message "%s %d INTERRUPTED! (Article %d, status %s)"
767 (or prefix "Bogofilter process...")
768 counter article status)
769 ;; Give some time for user to read. Sitting redisplays but gives up
770 ;; if input is pending. Sleeping does not give up, but it does not
771 ;; redisplay either. Mix both: let's redisplay and not give up.
777 ;;; spam.el ends here.