;;; spam-stat.el --- detecting spam based on statistics
-;; Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
+;; Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
+;; 2010 Free Software Foundation, Inc.
;; Author: Alex Schroeder <alex@gnu.org>
;; Keywords: network
;; This file is part of GNU Emacs.
-;; This is free software; you can redistribute it and/or modify it
-;; under the terms of the GNU General Public License as published by
-;; the Free Software Foundation; either version 2, or (at your option)
-;; any later version.
+;; GNU Emacs is free software: you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation, either version 3 of the License, or
+;; (at your option) any later version.
-;; This is distributed in the hope that it will be useful, but WITHOUT
-;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
-;; License for more details.
+;; GNU Emacs is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;; GNU General Public License for more details.
;; You should have received a copy of the GNU General Public License
-;; along with GNU Emacs; see the file COPYING. If not, write to the
-;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-;; Boston, MA 02111-1307, USA.
+;; along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>.
;;; Commentary:
;;; Code:
(require 'mail-parse)
+(defvar gnus-original-article-buffer)
+
;; Customization group for the spam-stat options; a subgroup of `gnus'.
(defgroup spam-stat nil
"Statistical spam detection for Emacs.
Use the functions to build a dictionary of words and their statistical
distribution in spam and non-spam mails. Then use a function to determine
whether a buffer contains spam or not."
+ :version "22.1"
:group 'gnus)
(defcustom spam-stat-file "~/.spam-stat.el"
:group 'spam-stat)
(defcustom spam-stat-split-fancy-spam-group "mail.spam"
- "Name of the group where spam should be stored, if
-`spam-stat-split-fancy' is used in fancy splitting rules. Has no
-effect when spam-stat is invoked through spam.el."
+ "Name of the group where spam should be stored.
+This applies when `spam-stat-split-fancy' is used in fancy splitting
+rules. It has no effect when spam-stat is invoked through spam.el."
:type 'string
:group 'spam-stat)
-(defcustom spam-stat-split-fancy-spam-threshhold 0.9
- "Spam score threshhold in spam-stat-split-fancy."
+(defcustom spam-stat-split-fancy-spam-threshold 0.9
+ "Spam score threshold in `spam-stat-split-fancy'."
:type 'number
:group 'spam-stat)
+(defcustom spam-stat-washing-hook nil
+ "Hook applied to each message before analysis.
+It is run by `spam-stat-buffer-words' before the words in the buffer
+are counted."
+ :type 'hook
+ :group 'spam-stat)
+
+(defcustom spam-stat-score-buffer-user-functions nil
+ "List of additional scoring functions.
+The functions are called one by one on the buffer.
+
+If all of these functions return non-nil answers, these numerical
+answers are added to the computed spam stat score on the buffer. If
+you defun such functions, make sure they don't return the buffer in a
+narrowed state or such: use, for example, `save-restriction' and
+`save-excursion'. Each of your functions is also passed the initial
+spam-stat score which might aid in your scoring.
+
+Also be careful when defining such functions. If they take a long
+time, they will slow down your mail splitting. Thus, if the buffer is
+large, don't forget to use smaller regions, by wrapping your work in,
+say, `with-spam-stat-max-buffer-size'."
+ :type '(repeat sexp)
+ :group 'spam-stat)
+
(defcustom spam-stat-process-directory-age 90
"Maximum age of files to be processed in a directory, in days.
When using `spam-stat-process-spam-directory' or
`spam-stat-process-non-spam-directory', only files that have
been touched in this many days will be considered. Without
this filter, re-training spam-stat with several thousand messages
-will start to take a very long time.")
+will start to take a very long time."
+ :type 'number
+ :group 'spam-stat)
+
+(defvar spam-stat-last-saved-at nil
+ "Time stamp of last change of `spam-stat-file' on this run.")
(defvar spam-stat-syntax-table
(let ((table (copy-syntax-table text-mode-syntax-table)))
;; The name starts with a space; Emacs treats such buffers as internal
;; and leaves them out of ordinary buffer listings.
(defvar spam-stat-buffer-name " *spam stat buffer*"
"Name of the `spam-stat-buffer'.")
+(defvar spam-stat-coding-system
+ (if (mm-coding-system-p 'emacs-mule) 'emacs-mule 'raw-text)
+ "Coding system used for `spam-stat-file'.
+This is `emacs-mule' if `mm-coding-system-p' accepts it, and
+`raw-text' otherwise.")
+
;; Hooking into Gnus
(defun spam-stat-store-current-buffer ()
(defvar spam-stat-nbad 0
"The number of bad (spam) mails in the dictionary.")
+(defvar spam-stat-error-holder nil
+ "Holder for errors caught by `condition-case' while scoring buffers.")
+
(defsubst spam-stat-good (entry)
"Return the number of times the word of ENTRY occurs in good mails.
The count is read from element 0 of ENTRY."
(aref entry 0))
;; Parsing
(defmacro with-spam-stat-max-buffer-size (&rest body)
- "Narrows the buffer down to the first 4k characters, then evaluates BODY."
+ "Narrow the buffer down to the first 4k characters, then evaluate BODY."
`(save-restriction
(when (> (- (point-max)
(point-min))
,@body))
(defun spam-stat-buffer-words ()
- "Return a hash table of words and number of occurences in the buffer."
+ "Return a hash table of words and number of occurrences in the buffer."
+ (run-hooks 'spam-stat-washing-hook)
(with-spam-stat-max-buffer-size
(with-syntax-table spam-stat-syntax-table
(goto-char (point-min))
(spam-stat-buffer-words))
(setq spam-stat-dirty t))
+(autoload 'gnus-message "gnus-util")
+
(defun spam-stat-buffer-change-to-spam ()
"Consider current buffer no longer normal mail but spam."
(setq spam-stat-nbad (1+ spam-stat-nbad)
(lambda (word count)
(let ((entry (gethash word spam-stat)))
(if (not entry)
- (error "This buffer has unknown words in it.")
+ (gnus-message 8 "This buffer has unknown words in it")
(spam-stat-set-good entry (- (spam-stat-good entry) count))
(spam-stat-set-bad entry (+ (spam-stat-bad entry) count))
(spam-stat-set-score entry (spam-stat-compute-score entry))