1 ;;; spam-stat.el --- detecting spam based on statistics
3 ;; Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
5 ;; Author: Alex Schroeder <alex@gnu.org>
7 ;; URL: http://www.emacswiki.org/cgi-bin/wiki.pl?SpamStat
9 ;; This file is part of GNU Emacs.
11 ;; This is free software; you can redistribute it and/or modify it
12 ;; under the terms of the GNU General Public License as published by
13 ;; the Free Software Foundation; either version 2, or (at your option)
16 ;; This is distributed in the hope that it will be useful, but WITHOUT
17 ;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
18 ;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
19 ;; License for more details.
21 ;; You should have received a copy of the GNU General Public License
22 ;; along with GNU Emacs; see the file COPYING. If not, write to the
23 ;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 ;; Boston, MA 02111-1307, USA.
28 ;; This implements spam analysis according to Paul Graham in "A Plan
29 ;; for Spam". The basis for all this is a statistical distribution of
30 ;; words for your spam and non-spam mails. We need this information
31 ;; in a hash-table so that the analysis can use the information when
32 ;; looking at your mails. Therefore, before you begin, you need tons
;; of mails (Graham uses 4000 non-spam and 4000 spam mails for his
;; experiment).
;; The main interface to spam-stat consists of the following functions:
38 ;; `spam-stat-buffer-is-spam' -- called in a buffer, that buffer is
39 ;; considered to be a new spam mail; use this for new mail that has
40 ;; not been processed before
42 ;; `spam-stat-buffer-is-non-spam' -- called in a buffer, that buffer
43 ;; is considered to be a new non-spam mail; use this for new mail that
44 ;; has not been processed before
46 ;; `spam-stat-buffer-change-to-spam' -- called in a buffer, that
47 ;; buffer is no longer considered to be normal mail but spam; use this
;; to change the status of a mail that has already been processed as
;; non-spam
51 ;; `spam-stat-buffer-change-to-non-spam' -- called in a buffer, that
52 ;; buffer is no longer considered to be spam but normal mail; use this
;; to change the status of a mail that has already been processed as
;; spam
56 ;; `spam-stat-save' -- save the hash table to the file; the filename
57 ;; used is stored in the variable `spam-stat-file'
59 ;; `spam-stat-load' -- load the hash table from a file; the filename
60 ;; used is stored in the variable `spam-stat-file'
62 ;; `spam-stat-score-word' -- return the spam score for a word
64 ;; `spam-stat-score-buffer' -- return the spam score for a buffer
66 ;; `spam-stat-split-fancy' -- for fancy mail splitting; add
67 ;; the rule (: spam-stat-split-fancy) to `nnmail-split-fancy'
69 ;; This requires the following in your ~/.gnus file:
71 ;; (require 'spam-stat)
;; A typical test involves calls to the following functions:
78 ;; Reset: (spam-stat-reset)
79 ;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam")
80 ;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc")
81 ;; Save table: (spam-stat-save)
82 ;; File size: (nth 7 (file-attributes spam-stat-file))
83 ;; Number of words: (hash-table-count spam-stat)
84 ;; Test spam: (spam-stat-test-directory "~/Mail/mail/spam")
85 ;; Test non-spam: (spam-stat-test-directory "~/Mail/mail/misc")
86 ;; Reduce table size: (spam-stat-reduce-size)
87 ;; Save table: (spam-stat-save)
88 ;; File size: (nth 7 (file-attributes spam-stat-file))
89 ;; Number of words: (hash-table-count spam-stat)
90 ;; Test spam: (spam-stat-test-directory "~/Mail/mail/spam")
91 ;; Test non-spam: (spam-stat-test-directory "~/Mail/mail/misc")
93 ;;; Dictionary Creation:
95 ;; Typically, you will filter away mailing lists etc. using specific
96 ;; rules in `nnmail-split-fancy'. Somewhere among these rules, you
97 ;; will filter spam. Here is how you would create your dictionary:
99 ;; Reset: (spam-stat-reset)
100 ;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam")
101 ;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc")
102 ;; Repeat for any other non-spam group you need...
103 ;; Reduce table size: (spam-stat-reduce-size)
104 ;; Save table: (spam-stat-save)
108 ;; Speed it up. Integrate with Gnus such that it uses spam and expiry
109 ;; marks to call the appropriate functions when leaving the summary
110 ;; buffer and saves the hash table when leaving Gnus. More testing:
111 ;; More mails, disabling SpamAssassin, double checking algorithm, find
112 ;; improved algorithm.
116 ;; Ted Zlatanov <tzz@lifelogs.com>
117 ;; Jesper Harder <harder@myrealbox.com>
118 ;; Dan Schmidt <dfan@dfan.org>
119 ;; Lasse Rasinen <lrasinen@iki.fi>
120 ;; Milan Zamazal <pdm@zamazal.org>
125 (require 'mail-parse)
127 (defgroup spam-stat nil
128 "Statistical spam detection for Emacs.
129 Use the functions to build a dictionary of words and their statistical
130 distribution in spam and non-spam mails. Then use a function to determine
131 whether a buffer contains spam or not."
134 (defcustom spam-stat-file "~/.spam-stat.el"
135 "File used to save and load the dictionary.
136 See `spam-stat-to-hash-table' for the format of the file."
140 (defcustom spam-stat-install-hooks t
141 "Whether spam-stat should install its hooks in Gnus.
142 This is set to nil if you use spam-stat through spam.el."
146 (defcustom spam-stat-unknown-word-score 0.2
147 "The score to use for unknown words.
148 Also used for words that don't appear often enough."
152 (defcustom spam-stat-max-word-length 15
153 "Only words shorter than this will be considered."
157 (defcustom spam-stat-max-buffer-length 10240
158 "Only the beginning of buffers will be analyzed.
159 This variable says how many characters this will be."
163 (defcustom spam-stat-split-fancy-spam-group "mail.spam"
164 "Name of the group where spam should be stored, if
165 `spam-stat-split-fancy' is used in fancy splitting rules. Has no
166 effect when spam-stat is invoked through spam.el."
170 (defcustom spam-stat-split-fancy-spam-threshhold 0.9
171 "Spam score threshhold in spam-stat-split-fancy."
;; The age is expressed in days.  Declaring `:type' gives Customize a
;; proper numeric widget and silences the byte-compiler warning about a
;; `defcustom' that fails to specify its type; `:group' files the option
;; under the `spam-stat' group explicitly rather than relying on the
;; most recently defined group.
(defcustom spam-stat-process-directory-age 90
  "Max. age of files to be processed in directory, in days.
When using `spam-stat-process-spam-directory' or
`spam-stat-process-non-spam-directory', only files that have
been touched in this many days will be considered.  Without
this filter, re-training spam-stat with several thousand messages
will start to take a very long time."
  :type 'number
  :group 'spam-stat)
183 (defvar spam-stat-syntax-table
184 (let ((table (copy-syntax-table text-mode-syntax-table)))
185 (modify-syntax-entry ?- "w" table)
186 (modify-syntax-entry ?_ "w" table)
187 (modify-syntax-entry ?. "w" table)
188 (modify-syntax-entry ?! "w" table)
189 (modify-syntax-entry ?? "w" table)
190 (modify-syntax-entry ?+ "w" table)
192 "Syntax table used when processing mails for statistical analysis.
193 The important part is which characters are word constituents.")
(defvar spam-stat-dirty nil
  "Non-nil if the spam-stat database has to be saved.")
(defvar spam-stat-buffer nil
  "The buffer that is scored while mail is being split.
Its value is set from hooks installed into Gnus.")
(defvar spam-stat-buffer-name " *spam stat buffer*"
  "The name given to the buffer stored in `spam-stat-buffer'.")
(defun spam-stat-store-current-buffer ()
  "Store a copy of the current buffer in `spam-stat-buffer'.
The copy is kept in the buffer named by `spam-stat-buffer-name',
which is created if it does not exist yet.  Text left over from an
earlier call is erased first, so the buffer holds only the text of
the current buffer.  Return the buffer holding the copy."
  (let ((buf (current-buffer)))
    (with-current-buffer (get-buffer-create spam-stat-buffer-name)
      ;; The scratch buffer is reused between calls; without erasing it,
      ;; each call would append to the text of previously stored
      ;; buffers instead of replacing it.
      (erase-buffer)
      (insert-buffer-substring buf)
      (setq spam-stat-buffer (current-buffer)))))
215 (defun spam-stat-store-gnus-article-buffer ()
216 "Store a copy of the current article in `spam-stat-buffer'.
217 This uses `gnus-article-buffer'."