1 ;;; spam.el --- Identifying spam
2 ;; Copyright (C) 2002 Free Software Foundation, Inc.
4 ;; Author: Lars Magne Ingebrigtsen <larsi@gnus.org>
7 ;; This file is part of GNU Emacs.
9 ;; GNU Emacs is free software; you can redistribute it and/or modify
10 ;; it under the terms of the GNU General Public License as published by
11 ;; the Free Software Foundation; either version 2, or (at your option)
14 ;; GNU Emacs is distributed in the hope that it will be useful,
15 ;; but WITHOUT ANY WARRANTY; without even the implied warranty of
16 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 ;; GNU General Public License for more details.
19 ;; You should have received a copy of the GNU General Public License
20 ;; along with GNU Emacs; see the file COPYING. If not, write to the
21 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
22 ;; Boston, MA 02111-1307, USA.
26 ;;; This module addresses a few aspects of spam control under Gnus. Page
27 ;;; breaks are used for grouping declarations and documentation relating to
28 ;;; each particular aspect.
30 ;;; The integration with Gnus is not yet complete. See various `FIXME'
31 ;;; comments, below, for supplementary explanations or discussions.
33 ;;; Several TODO items are marked as such
39 (require 'gnus-uu) ; because of key prefix issues
40 (require 'gnus) ; for the definitions of group content classification and spam processors
41 (require 'message) ;for the message-fetch-field functions
43 ;; autoload executable-find
45 ;; executable-find is not autoloaded in Emacs 20
46 (autoload 'executable-find "executable"))
50 (autoload 'query-dig "dig"))
54 (autoload 'query-dns "dns"))
59 "Spam configuration.")
61 (defcustom spam-directory "~/News/spam/"
62 "Directory for spam whitelists and blacklists."
66 (defcustom spam-move-spam-nonspam-groups-only t
67 "Whether spam should be moved in non-spam groups only.
68 When nil, only ham and unclassified groups will have their spam moved
69 to the spam-process-destination. When t, spam will also be moved from
74 (defcustom spam-whitelist (expand-file-name "whitelist" spam-directory)
75 "The location of the whitelist.
76 The file format is one regular expression per line.
77 The regular expression is matched against the address."
81 (defcustom spam-blacklist (expand-file-name "blacklist" spam-directory)
82 "The location of the blacklist.
83 The file format is one regular expression per line.
84 The regular expression is matched against the address."
88 (defcustom spam-use-dig t
89 "Whether query-dig should be used instead of query-dns."
93 (defcustom spam-use-blacklist nil
94 "Whether the blacklist should be used by spam-split."
98 (defcustom spam-use-whitelist nil
99 "Whether the whitelist should be used by spam-split."
103 (defcustom spam-use-blackholes nil
104 "Whether blackholes should be used by spam-split."
108 (defcustom spam-use-bogofilter nil
109 "Whether bogofilter should be used by spam-split."
113 (defcustom spam-use-BBDB nil
114 "Whether BBDB should be used by spam-split."
118 (defcustom spam-use-ifile nil
119 "Whether ifile should be used by spam-split."
123 (defcustom spam-split-group "spam"
124 "Group name where incoming spam should be put by spam-split."
128 (defcustom spam-junk-mailgroups (cons spam-split-group '("mail.junk" "poste.pourriel"))
129 "Mailgroups with spam contents.
130 All unmarked article in such group receive the spam mark on group entry."
131 :type '(repeat (string :tag "Group"))
134 (defcustom spam-blackhole-servers '("bl.spamcop.net" "relays.ordb.org"
135 "dev.null.dk" "relays.visi.com")
136 "List of blackhole servers."
137 :type '(repeat (string :tag "Server"))
140 (defcustom spam-ham-marks (list 'gnus-del-mark 'gnus-read-mark
141 'gnus-killed-mark 'gnus-kill-file-mark
142 'gnus-low-score-mark)
143 "Marks considered as being ham (positively not spam).
144 Such articles will be processed as ham (non-spam) on group exit."
146 (variable-item gnus-del-mark)
147 (variable-item gnus-read-mark)
148 (variable-item gnus-killed-mark)
149 (variable-item gnus-kill-file-mark)
150 (variable-item gnus-low-score-mark))
153 (defcustom spam-spam-marks (list 'gnus-spam-mark)
154 "Marks considered as being spam (positively spam).
155 Such articles will be transmitted to `bogofilter -s' on group exit."
157 (variable-item gnus-spam-mark)
158 (variable-item gnus-killed-mark)
159 (variable-item gnus-kill-file-mark)
160 (variable-item gnus-low-score-mark))
163 (defcustom spam-face 'gnus-splash-face
164 "Face for spam-marked articles"
168 (defgroup spam-ifile nil
169 "Spam ifile configuration."
172 (defcustom spam-ifile-path (executable-find "ifile")
173 "File path of the ifile executable program."
174 :type '(choice (file :tag "Location of ifile")
175 (const :tag "ifile is not installed"))
178 (defcustom spam-ifile-database-path nil
179 "File path of the ifile database."
180 :type '(choice (file :tag "Location of the ifile database")
181 (const :tag "Use the default"))
184 (defcustom spam-ifile-spam-category "spam"
185 "Name of the spam ifile category."
189 (defcustom spam-ifile-all-categories nil
190 "Whether the ifile check will return all categories, or just spam.
191 Set this to t if you want to use the spam-split invocation of ifile as
192 your main source of newsgroup names."
196 (defgroup spam-bogofilter nil
197 "Spam bogofilter configuration."
200 (defcustom spam-bogofilter-output-buffer-name "*Bogofilter Output*"
201 "Name of buffer when displaying `bogofilter -v' output."
203 :group 'spam-bogofilter)
205 (defcustom spam-bogofilter-initial-timeout 40
206 "Timeout in seconds for the initial reply from the `bogofilter' program."
208 :group 'spam-bogofilter)
210 (defcustom spam-bogofilter-subsequent-timeout 15
211 "Timeout in seconds for any subsequent reply from the `bogofilter' program."
213 :group 'spam-bogofilter)
215 (defcustom spam-bogofilter-path (executable-find "bogofilter")
216 "File path of the Bogofilter executable program."
217 :type '(choice (file :tag "Location of bogofilter")
218 (const :tag "Bogofilter is not installed"))
219 :group 'spam-bogofilter)
221 ;; FIXME! In the following regexp, we should explain which tool produces
222 ;; which kind of header. I do not even remember them all by now. X-Junk
223 ;; (and previously X-NoSpam) are produced by the `NoSpam' tool, which has
224 ;; never been published, so it might not be reasonable leaving it in the
226 (defcustom spam-bogofilter-spaminfo-header-regexp
227 "^X-\\(jf\\|Junk\\|NoSpam\\|Spam\\|SB\\)[^:]*:"
228 "Regexp for spam markups in headers.
229 Markup from spam recognisers, as well as `Xref', are to be removed from
230 articles before they get registered by Bogofilter."
232 :group 'spam-bogofilter)
234 ;;; Key bindings for spam control.
236 (gnus-define-keys gnus-summary-mode-map
237 "St" spam-bogofilter-score
238 "Sx" gnus-summary-mark-as-spam
239 "Mst" spam-bogofilter-score
240 "Msx" gnus-summary-mark-as-spam
241 "\M-d" gnus-summary-mark-as-spam)
243 ;;; How to highlight a spam summary line.
245 ;; TODO: How do we redo this every time spam-face is customized?
247 (push '((eq mark gnus-spam-mark) . spam-face)
248 gnus-summary-highlight)
250 ;; convenience functions
251 (defun spam-group-spam-contents-p (group)
253 (or (member group spam-junk-mailgroups)
254 (memq 'gnus-group-spam-classification-spam
255 (gnus-parameter-spam-contents group)))
258 (defun spam-group-ham-contents-p (group)
260 (memq 'gnus-group-spam-classification-ham
261 (gnus-parameter-spam-contents group))
264 (defun spam-group-processor-p (group processor)
265 (if (and (stringp group)
267 (member processor (car (gnus-parameter-spam-process group)))
270 (defun spam-group-spam-processor-bogofilter-p (group)
271 (spam-group-processor-p group 'gnus-group-spam-exit-processor-bogofilter))
273 (defun spam-group-spam-processor-blacklist-p (group)
274 (spam-group-processor-p group 'gnus-group-spam-exit-processor-blacklist))
276 (defun spam-group-spam-processor-ifile-p (group)
277 (spam-group-processor-p group 'gnus-group-spam-exit-processor-ifile))
279 (defun spam-group-ham-processor-ifile-p (group)
280 (spam-group-processor-p group 'gnus-group-ham-exit-processor-ifile))
282 (defun spam-group-ham-processor-whitelist-p (group)
283 (spam-group-processor-p group 'gnus-group-ham-exit-processor-whitelist))
285 (defun spam-group-ham-processor-BBDB-p (group)
286 (spam-group-processor-p group 'gnus-group-ham-exit-processor-BBDB))
288 ;;; Summary entry and exit processing.
290 (defun spam-summary-prepare ()
291 (spam-mark-junk-as-spam-routine))
293 (add-hook 'gnus-summary-prepare-hook 'spam-summary-prepare)
295 (defun spam-summary-prepare-exit ()
296 ;; The spam processors are invoked for any group, spam or ham or neither
297 (when (and spam-bogofilter-path
298 (spam-group-spam-processor-bogofilter-p gnus-newsgroup-name))
299 (spam-bogofilter-register-routine))
301 (when (and spam-ifile-path
302 (spam-group-spam-processor-ifile-p gnus-newsgroup-name))
303 (spam-ifile-register-spam-routine))
305 (when (spam-group-spam-processor-bogofilter-p gnus-newsgroup-name)
306 (spam-blacklist-register-routine))
308 (if spam-move-spam-nonspam-groups-only
309 (when (not (spam-group-spam-contents-p gnus-newsgroup-name))
310 (spam-mark-spam-as-expired-and-move-routine
311 (gnus-parameter-spam-process-destination gnus-newsgroup-name)))
312 (spam-mark-spam-as-expired-and-move-routine
313 (gnus-parameter-spam-process-destination gnus-newsgroup-name)))
315 ;; now we redo spam-mark-spam-as-expired-and-move-routine to only
316 ;; expire spam, in case the above did not expire them
317 (spam-mark-spam-as-expired-and-move-routine nil)
319 (when (spam-group-ham-contents-p gnus-newsgroup-name)
320 (when (spam-group-ham-processor-whitelist-p gnus-newsgroup-name)
321 (spam-whitelist-register-routine))
322 (when (spam-group-ham-processor-ifile-p gnus-newsgroup-name)
323 (spam-ifile-register-ham-routine))
324 (when (spam-group-ham-processor-BBDB-p gnus-newsgroup-name)
325 (spam-BBDB-register-routine)))
327 ;; now move all ham articles out of spam groups
328 (when (spam-group-spam-contents-p gnus-newsgroup-name)
329 (spam-ham-move-routine
330 (gnus-parameter-ham-process-destination gnus-newsgroup-name))))
332 (add-hook 'gnus-summary-prepare-exit-hook 'spam-summary-prepare-exit)
334 (defun spam-mark-junk-as-spam-routine ()
335 ;; check the global list of group names spam-junk-mailgroups and the
337 (when (spam-group-spam-contents-p gnus-newsgroup-name)
338 (let ((articles gnus-newsgroup-articles)
341 (setq article (pop articles))
342 (when (eq (gnus-summary-article-mark article) gnus-unread-mark)
343 (gnus-summary-mark-article article gnus-spam-mark))))))
345 (defun spam-mark-spam-as-expired-and-move-routine (&optional group)
346 (let ((articles gnus-newsgroup-articles)
349 (setq article (pop articles))
350 (when (eq (gnus-summary-article-mark article) gnus-spam-mark)
351 (gnus-summary-mark-article article gnus-expirable-mark)
352 (when (stringp group)
353 (let ((gnus-current-article article))
354 (gnus-summary-move-article nil group)))))))
356 (defun spam-ham-move-routine (&optional group)
357 (let ((articles gnus-newsgroup-articles)
358 article ham-mark-values mark)
359 (dolist (mark spam-ham-marks)
360 (push (symbol-value mark) ham-mark-values))
363 (setq article (pop articles))
364 (when (and (memq mark ham-mark-values)
366 (let ((gnus-current-article article))
367 (gnus-summary-move-article nil group))))))
369 (defun spam-generic-register-routine (spam-func ham-func)
370 (let ((articles gnus-newsgroup-articles)
371 article mark ham-articles spam-articles spam-mark-values
374 ;; marks are stored as symbolic values, so we have to dereference
375 ;; them for memq to work. we wouldn't have to do this if
376 ;; gnus-summary-article-mark returned a symbol.
377 (dolist (mark spam-ham-marks)
378 (push (symbol-value mark) ham-mark-values))
380 (dolist (mark spam-spam-marks)
381 (push (symbol-value mark) spam-mark-values))
384 (setq article (pop articles)
385 mark (gnus-summary-article-mark article))
386 (cond ((memq mark spam-mark-values) (push article spam-articles))
387 ((memq article gnus-newsgroup-saved))
388 ((memq mark ham-mark-values) (push article ham-articles))))
389 (when (and ham-articles ham-func)
390 (mapc ham-func ham-articles)) ; we use mapc because unlike
391 ; mapcar it discards the
393 (when (and spam-articles spam-func)
394 (mapc spam-func spam-articles)))) ; we use mapc because unlike
395 ; mapcar it discards the
399 (defalias 'spam-point-at-eol (if (fboundp 'point-at-eol)
401 'line-end-position)))
403 (defun spam-get-article-as-string (article)
404 (let ((article-string))
405 (when (numberp article)
406 (save-window-excursion
407 (gnus-summary-goto-subject article)
408 (gnus-summary-show-article t)
409 (set-buffer gnus-article-buffer)
410 (setq article-string (buffer-string))))
413 (defun spam-fetch-field-from-fast (article)
414 "Fetch the `from' field quickly, using the internal gnus-data-list function"
415 (if (and (numberp article)
416 (assoc article (gnus-data-list nil)))
417 (mail-header-from (gnus-data-header (assoc article (gnus-data-list nil))))
420 (defun spam-fetch-field-subject-fast (article)
421 "Fetch the `subject' field quickly, using the internal gnus-data-list function"
422 (if (and (numberp article)
423 (assoc article (gnus-data-list nil)))
424 (mail-header-subject (gnus-data-header (assoc article (gnus-data-list nil))))
428 ;;;; Spam determination.
430 (defvar spam-list-of-checks
431 '((spam-use-blacklist . spam-check-blacklist)
432 (spam-use-whitelist . spam-check-whitelist)
433 (spam-use-BBDB . spam-check-BBDB)
434 (spam-use-ifile . spam-check-ifile)
435 (spam-use-blackholes . spam-check-blackholes)
436 (spam-use-bogofilter . spam-check-bogofilter))
437 "The spam-list-of-checks list contains pairs associating a parameter
438 variable with a spam checking function. If the parameter variable is
439 true, then the checking function is called, and its value decides what
440 happens. Each individual check may return `nil', `t', or a mailgroup
441 name. The value `nil' means that the check does not yield a decision,
442 and so, that further checks are needed. The value `t' means that the
443 message is definitely not spam, and that further spam checks should be
444 inhibited. Otherwise, a mailgroup name is returned where the mail
445 should go, and further checks are also inhibited. The usual mailgroup
446 name is the value of `spam-split-group', meaning that the message is
450 "Split this message into the `spam' group if it is spam.
451 This function can be used as an entry in `nnmail-split-fancy', for
452 example like this: (: spam-split)
454 See the Info node `(gnus)Fancy Mail Splitting' for more details."
457 (let ((list-of-checks spam-list-of-checks)
459 (while (and list-of-checks (not decision))
460 (let ((pair (pop list-of-checks)))
461 (when (symbol-value (car pair))
462 (setq decision (funcall (cdr pair))))))
469 (defun spam-check-blackholes ()
470 "Check the Received headers for blackholed relays."
471 (let ((headers (message-fetch-field "received"))
476 (goto-char (point-min))
477 (while (re-search-forward
478 "\\[\\([0-9]+.[0-9]+.[0-9]+.[0-9]+\\)\\]" nil t)
479 (message "Blackhole search found host IP %s." (match-string 1))
480 (push (mapconcat 'identity
481 (nreverse (split-string (match-string 1) "\\."))
484 (dolist (server spam-blackhole-servers)
486 (let ((query-string (concat ip "." server)))
488 (let ((query-result (query-dig query-string)))
490 (message "spam: positive blackhole check '%s'" query-result)
491 (push (list ip server query-result)
493 ;; else, if not using dig.el
494 (when (query-dns query-string)
495 (push (list ip server (query-dns query-string 'TXT))
502 ;;; original idea for spam-check-BBDB from Alexander Kotelnikov
503 ;;; <sacha@giotto.sj.ru>
505 ;; all this is done inside a condition-case to trap errors
512 (defun spam-enter-ham-BBDB (from)
513 "Enter an address into the BBDB; implies ham (non-spam) sender"
515 (let* ((parsed-address (gnus-extract-address-components from))
516 (name (or (car parsed-address) "Ham Sender"))
517 (net-address (car (cdr parsed-address))))
518 (message "Adding address %s to BBDB" from)
519 (when (and net-address
520 (not (bbdb-search-simple nil net-address)))
521 (bbdb-create-internal name nil net-address nil nil
522 "ham sender added by spam.el")))))
524 (defun spam-BBDB-register-routine ()
525 (spam-generic-register-routine
530 (spam-enter-ham-BBDB (spam-fetch-field-from-fast article)))))
532 (defun spam-check-BBDB ()
533 "Mail from people in the BBDB is never considered spam"
534 (let ((who (message-fetch-field "from")))
536 (setq who (regexp-quote (cadr
537 (gnus-extract-address-components who))))
538 (if (bbdb-search-simple nil who)
539 nil spam-split-group)))))
542 (defalias 'bbdb-search-simple 'ignore)
543 (defalias 'spam-check-BBDB 'ignore)
544 (defalias 'spam-BBDB-register-routine 'ignore)
545 (defalias 'spam-enter-ham-BBDB 'ignore)
546 (defalias 'bbdb-create-internal 'ignore)
547 (defalias 'bbdb-records 'ignore))))
552 ;;; check the ifile backend; return nil if the mail was NOT classified
555 (defun spam-get-ifile-database-parameter ()
556 "Get the command-line parameter for ifile's database from spam-ifile-database-path."
557 (if spam-ifile-database-path
558 (format "--db-file=%s" spam-ifile-database-path)
561 (defun spam-check-ifile ()
562 "Check the ifile backend for the classification of this message"
563 (let ((article-buffer-name (buffer-name))
566 (let ((temp-buffer-name (buffer-name))
567 (db-param (spam-get-ifile-database-parameter)))
569 (set-buffer article-buffer-name)
571 (call-process-region (point-min) (point-max) spam-ifile-path
572 nil temp-buffer-name nil "-q" "-c" db-param)
573 (call-process-region (point-min) (point-max) spam-ifile-path
574 nil temp-buffer-name nil "-q" "-c")))
575 (goto-char (point-min))
577 (setq category (buffer-substring (point) (spam-point-at-eol))))
578 (when (not (zerop (length category))) ; we need a category here
579 (if spam-ifile-all-categories
580 (setq return category)
581 ;; else, if spam-ifile-all-categories is not set...
582 (when (string-equal spam-ifile-spam-category category)
583 ;; always accept the ifile category
584 (setq return spam-split-group))))))
587 (defun spam-ifile-register-with-ifile (article-string category)
588 "Register an article, given as a string, with a category.
589 Uses `gnus-newsgroup-name' if category is nil (for ham registration)."
590 (when (stringp article-string)
591 (let ((category (or category gnus-newsgroup-name))
592 (db-param (spam-get-ifile-database-parameter)))
594 (insert-string article-string)
596 (call-process-region (point-min) (point-max) spam-ifile-path
598 "-h" "-i" category db-param)
599 (call-process-region (point-min) (point-max) spam-ifile-path
601 "-h" "-i" category))))))
603 (defun spam-ifile-register-spam-routine ()
604 (spam-generic-register-routine
606 (spam-ifile-register-with-ifile
607 (spam-get-article-as-string article) spam-ifile-spam-category))
610 (defun spam-ifile-register-ham-routine ()
611 (spam-generic-register-routine
614 (spam-ifile-register-with-ifile
615 (spam-get-article-as-string article) nil))))
618 ;;;; Blacklists and whitelists.
620 (defvar spam-whitelist-cache nil)
621 (defvar spam-blacklist-cache nil)
623 (defun spam-enter-whitelist (address)
624 "Enter ADDRESS into the whitelist."
625 (interactive "sAddress: ")
626 (spam-enter-list address spam-whitelist)
627 (setq spam-whitelist-cache nil))
629 (defun spam-enter-blacklist (address)
630 "Enter ADDRESS into the blacklist."
631 (interactive "sAddress: ")
632 (spam-enter-list address spam-blacklist)
633 (setq spam-blacklist-cache nil))
635 (defun spam-enter-list (address file)
636 "Enter ADDRESS into the given FILE, either the whitelist or the blacklist."
637 (unless (file-exists-p (file-name-directory file))
638 (make-directory (file-name-directory file) t))
641 (find-file-noselect file))
642 (goto-char (point-max))
645 (insert address "\n")
648 ;;; returns nil if the sender is in the whitelist, spam-split-group otherwise
649 (defun spam-check-whitelist ()
650 ;; FIXME! Should it detect when file timestamps change?
651 (unless spam-whitelist-cache
652 (setq spam-whitelist-cache (spam-parse-list spam-whitelist)))
653 (if (spam-from-listed-p spam-whitelist-cache) nil spam-split-group))
655 (defun spam-check-blacklist ()
656 ;; FIXME! Should it detect when file timestamps change?
657 (unless spam-blacklist-cache
658 (setq spam-blacklist-cache (spam-parse-list spam-blacklist)))
659 (and (spam-from-listed-p spam-blacklist-cache) spam-split-group))
661 (defun spam-parse-list (file)
662 (when (file-readable-p file)
663 (let (contents address)
665 (insert-file-contents file)
667 (setq address (buffer-substring (point) (spam-point-at-eol)))
669 (unless (zerop (length address))
670 (setq address (regexp-quote address))
671 (while (string-match "\\\\\\*" address)
672 (setq address (replace-match ".*" t t address)))
673 (push address contents))))
674 (nreverse contents))))
676 (defun spam-from-listed-p (cache)
677 (let ((from (message-fetch-field "from"))
680 (when (string-match (pop cache) from)
685 (defun spam-blacklist-register-routine ()
686 (spam-generic-register-routine
689 (let ((from (spam-fetch-field-from-fast article)))
691 (spam-enter-blacklist from))))
695 (defun spam-whitelist-register-routine ()
696 (spam-generic-register-routine
701 (let ((from (spam-fetch-field-from-fast article)))
703 (spam-enter-whitelist from))))))
708 ;;; See Paul Graham article, at `http://www.paulgraham.com/spam.html'.
710 ;;; This page is for those wanting to control spam with the help of
711 ;;; Eric Raymond's speedy Bogofilter, see
712 ;;; http://www.tuxedo.org/~esr/bogofilter. This has been tested with
713 ;;; a locally patched copy of version 0.4.
715 ;;; Make sure Bogofilter is installed. Bogofilter internally uses
716 ;;; Judy fast associative arrays, so you need to install Judy first,
717 ;;; and Bogofilter next. Fetch both distributions by visiting the
718 ;;; following links and downloading the latest version of each:
720 ;;; http://sourceforge.net/projects/judy/
721 ;;; http://www.tuxedo.org/~esr/bogofilter/
723 ;;; Unpack the Judy distribution and enter its main directory. Then do:
729 ;;; You will likely need to become super-user for the last step.
730 ;;; Then, unpack the Bogofilter distribution and enter its main
736 ;;; Here as well, you need to become super-user for the last step.
737 ;;; Now, initialize your word lists by doing, under your own identity:
739 ;;; mkdir ~/.bogofilter
740 ;;; touch ~/.bogofilter/badlist
741 ;;; touch ~/.bogofilter/goodlist
743 ;;; These two files are text files you may edit, but you normally don't!
745 ;;; The `M-d' command gets added to Gnus summary mode, marking current
746 ;;; article as spam, showing it with the `H' mark. Whenever you see a
747 ;;; spam article, make sure to mark its summary line with `M-d' before
748 ;;; leaving the group. Some groups, as per variable
749 ;;; `spam-junk-mailgroups' below, receive articles from Gnus splitting
750 ;;; on clues added by spam recognisers, so for these groups, we tack
751 ;;; an `H' mark at group entry for all summary lines which would
752 ;;; otherwise have no other mark. Make sure to _remove_ `H' marks for
753 ;;; any article which is _not_ genuine spam, before leaving such
754 ;;; groups: you may use `M-u' to "unread" the article, or `d' for
755 ;;; declaring it read the non-spam way. When you leave a group, all
756 ;;; `H' marked articles, saved or unsaved, are sent to Bogofilter
757 ;;; which will study them as spam samples.
759 ;;; Messages may also be deleted in various other ways, and unless
760 ;;; `spam-ham-marks-form' gets overridden below, marks `R' and `r' for
761 ;;; default read or explicit delete, marks `X' and 'K' for automatic
762 ;;; or explicit kills, as well as mark `Y' for low scores, are all
763 ;;; considered to be associated with articles which are not spam.
764 ;;; This assumption might be false, in particular if you use kill
765 ;;; files or score files as means for detecting genuine spam, you
766 ;;; should then adjust `spam-ham-marks-form'. When you leave a group,
767 ;;; all _unsaved_ articles bearing any the above marks are sent to
768 ;;; Bogofilter which will study these as not-spam samples. If you
769 ;;; explicit kill a lot, you might sometimes end up with articles
770 ;;; marked `K' which you never saw, and which might accidentally
771 ;;; contain spam. Best is to make sure that real spam is marked with
772 ;;; `H', and nothing else.
774 ;;; All other marks do not contribute to Bogofilter pre-conditioning.
775 ;;; In particular, ticked, dormant or souped articles are likely to
776 ;;; contribute later, when they will get deleted for real, so there is
777 ;;; no need to use them prematurely. Explicitly expired articles do
778 ;;; not contribute, command `E' is a way to get rid of an article
779 ;;; without Bogofilter ever seeing it.
781 ;;; In a word, with a minimum of care for associating the `H' mark for
782 ;;; spam articles only, Bogofilter training all gets fairly automatic.
783 ;;; You should do this until you get a few hundreds of articles in
784 ;;; each category, spam or not. The shell command `head -1
785 ;;; ~/.bogofilter/*' shows both article counts. The command `S S' in
786 ;;; summary mode, either for debugging or for curiosity, triggers
787 ;;; Bogofilter into displaying in another buffer the "spamicity" score
788 ;;; of the current article (between 0.0 and 1.0), together with the
789 ;;; article words which most significantly contribute to the score.
791 ;;; The real way for using Bogofilter, however, is to have some use
792 ;;; tool like `procmail' for invoking it on message reception, then
793 ;;; adding some recognisable header in case of detected spam. Gnus
794 ;;; splitting rules might later trip on these added headers and react
795 ;;; by sorting such articles into specific junk folders as per
796 ;;; `spam-junk-mailgroups'. Here is a possible `.procmailrc' contents
797 ;;; (still untested -- please tell me how it goes):
801 ;;; | formail -bfI "X-Spam-Status: Yes"
803 (defun spam-check-bogofilter ()
804 ;; Dynamic spam check. I do not know how to check the exit status,
805 ;; so instead, read `bogofilter -v' output.
806 (when (and spam-use-bogofilter spam-bogofilter-path)
807 (spam-bogofilter-articles nil "-v" (list (gnus-summary-article-number)))
808 (when (save-excursion
809 (set-buffer spam-bogofilter-output-buffer-name)
810 (goto-char (point-min))
811 (re-search-forward "Spamicity: \\(0\\.9\\|1\\.0\\)" nil t))
814 (defun spam-bogofilter-score ()
815 "Use `bogofilter -v' on the current article.
816 This yields the 15 most discriminant words for this article and the
817 spamicity coefficient of each, and the overall article spamicity."
819 (when (and spam-use-bogofilter spam-bogofilter-path)
820 (spam-bogofilter-articles nil "-v" (list (gnus-summary-article-number)))
821 (with-current-buffer spam-bogofilter-output-buffer-name
822 (unless (zerop (buffer-size))
823 (if (<= (count-lines (point-min) (point-max)) 1)
825 (goto-char (point-max))
828 (message "%s" (buffer-substring (point-min) (point))))
829 (goto-char (point-min))
830 (display-buffer (current-buffer)))))))
832 (defun spam-bogofilter-register-routine ()
833 (let ((articles gnus-newsgroup-articles)
834 article mark ham-articles spam-articles spam-mark-values
837 ;; marks are stored as symbolic values, so we have to dereference
838 ;; them for memq to work we wouldn't have to do this if
839 ;; gnus-summary-article-mark returned a symbol.
840 (dolist (mark spam-ham-marks)
841 (push (symbol-value mark) ham-mark-values))
843 (dolist (mark spam-spam-marks)
844 (push (symbol-value mark) spam-mark-values))
847 (setq article (pop articles)
848 mark (gnus-summary-article-mark article))
849 (cond ((memq mark spam-mark-values) (push article spam-articles))
850 ((memq article gnus-newsgroup-saved))
851 ((memq mark ham-mark-values) (push article ham-articles))))
853 (spam-bogofilter-articles "ham" "-n" ham-articles))
855 (spam-bogofilter-articles "SPAM" "-s" spam-articles))))
857 (defun spam-bogofilter-articles (type option articles)
858 (let ((output-buffer (get-buffer-create spam-bogofilter-output-buffer-name))
859 (article-copy (get-buffer-create " *Bogofilter Article Copy*"))
860 (remove-regexp (concat spam-bogofilter-spaminfo-header-regexp
863 prefix process article)
865 (setq prefix (format "Studying %d articles as %s..." (length articles)
867 (message "%s" prefix))
868 (save-excursion (set-buffer output-buffer) (erase-buffer))
869 (setq process (start-process "bogofilter" output-buffer
870 spam-bogofilter-path "-F" option))
871 (process-kill-without-query process t)
873 (save-window-excursion
875 (setq counter (1+ counter))
877 (message "%s %d" prefix counter))
878 (setq article (pop articles))
879 (gnus-summary-goto-subject article)
880 (gnus-summary-show-article t)
881 (gnus-eval-in-buffer-window article-copy
882 (insert-buffer-substring gnus-original-article-buffer)
883 ;; Remove spam classification redundant headers: they may induce
884 ;; unwanted biases in later analysis.
885 (message-remove-header remove-regexp t)
886 ;; Bogofilter really wants From envelopes for counting articles.
887 ;; Fake one at the beginning, make sure there will be no other.
888 (goto-char (point-min))
889 (if (looking-at "From ")
891 (insert "From nobody " (current-time-string) "\n"))
892 (let (case-fold-search)
893 (while (re-search-forward "^From " nil t)
896 (process-send-region process (point-min) (point-max))
898 ;; Sending the EOF is unwind-protected. This is to prevent lost copies
899 ;; of `bogofilter', hung on reading their standard input, in case the
900 ;; whole registering process gets interrupted by the user.
901 (process-send-eof process))
902 (kill-buffer article-copy)
903 ;; Receive process output. It sadly seems that we still have to protect
904 ;; ourselves against hung `bogofilter' processes.
905 (let ((status (process-status process))
906 (timeout (* 1000 spam-bogofilter-initial-timeout))
907 (quanta 200)) ; also counted in milliseconds
908 (while (and (not (eq status 'exit)) (> timeout 0))
909 ;; `accept-process-output' timeout is counted in microseconds.
910 (setq timeout (if (accept-process-output process 0 (* 1000 quanta))
911 (* 1000 spam-bogofilter-subsequent-timeout)
913 status (process-status process)))
914 (if (eq status 'exit)
916 (message "%s done!" prefix))
917 ;; Sigh! The process did time out... Become brutal!
918 (interrupt-process process)
919 (message "%s %d INTERRUPTED! (Article %d, status %s)"
920 (or prefix "Bogofilter process...")
921 counter article status)
922 ;; Give some time for user to read. Sitting redisplays but gives up
923 ;; if input is pending. Sleeping does not give up, but it does not
924 ;; redisplay either. Mix both: let's redisplay and not give up.
930 ;;; spam.el ends here.