;;; spam-stat.el --- detecting spam based on statistics
-;; Copyright (C) 2002, 2003 Free Software Foundation, Inc.
+;; Copyright (C) 2002, 2003, 2004, 2005, 2006 Free Software Foundation, Inc.
;; Author: Alex Schroeder <alex@gnu.org>
;; Keywords: network
;; You should have received a copy of the GNU General Public License
;; along with GNU Emacs; see the file COPYING. If not, write to the
-;; Free Software Foundation, Inc., 59 Temple Place - Suite 330,
-;; Boston, MA 02111-1307, USA.
+;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
;;; Commentary:
;;; Code:
(require 'mail-parse)
+(defvar gnus-original-article-buffer) ; value-less defvar: only declares the variable for the byte compiler -- NOTE(review): presumably defined by Gnus, confirm
+
(defgroup spam-stat nil
"Statistical spam detection for Emacs.
Use the functions to build a dictionary of words and their statistical
distribution in spam and non-spam mails. Then use a function to determine
whether a buffer contains spam or not."
+ :version "22.1" ; Customize metadata: the Emacs release associated with this group
:group 'gnus)
(defcustom spam-stat-file "~/.spam-stat.el"
:group 'spam-stat)
(defcustom spam-stat-split-fancy-spam-group "mail.spam"
- "Name of the group where spam should be stored, if
-`spam-stat-split-fancy' is used in fancy splitting rules. Has no
-effect when spam-stat is invoked through spam.el."
+ "Name of the group where spam should be stored.
+Used only when `spam-stat-split-fancy' appears in fancy splitting
+rules.  Has no effect when spam-stat is invoked through spam.el."
:type 'string
:group 'spam-stat)
-(defcustom spam-stat-split-fancy-spam-threshhold 0.9
- "Spam score threshhold in spam-stat-split-fancy."
+;; Keep the old, misspelled option name usable as an alias of the new
+;; one.  The `fboundp' guard keeps the file loadable where
+;; `defvaralias' does not exist.
+(when (fboundp 'defvaralias)
+  (defvaralias 'spam-stat-split-fancy-spam-threshhold
+    'spam-stat-split-fancy-spam-threshold))
+
+(defcustom spam-stat-split-fancy-spam-threshold 0.9
+ "Spam score threshold in `spam-stat-split-fancy'.
+That function returns the spam group only when the score computed by
+`spam-stat-score-buffer' is strictly greater than this value."
+ :type 'number
+ :group 'spam-stat)
+
+(defcustom spam-stat-washing-hook nil
+ "Hook run on each message before it is analyzed.
+`spam-stat-buffer-words' runs it in the current buffer before the
+buffer is narrowed and its words are counted."
+ :group 'spam-stat
+ :type 'hook)
+
+(defcustom spam-stat-score-buffer-user-functions nil
+ "List of additional scoring functions.
+`spam-stat-score-buffer' calls them one by one on the buffer, each
+with the initial spam-stat score as its only argument.
+
+If all of these functions return non-nil answers, these numerical
+answers are added to the computed spam stat score on the buffer.  If
+any of them returns nil or signals an error, the computed score is
+used unchanged.  If you defun such functions, make sure they don't
+return the buffer in a narrowed state or such: use, for example,
+`save-restriction' and `save-excursion'.
+
+Also be careful when defining such functions.  If they take a long
+time, they will slow down your mail splitting.  Thus, if the buffer is
+large, don't forget to use smaller regions, by wrapping your work in,
+say, `with-spam-stat-max-buffer-size'."
+ :type '(repeat function)
+ :group 'spam-stat)
+
+(defcustom spam-stat-process-directory-age 90
+ "Maximum age, in days, of files processed in a directory.
+`spam-stat-process-spam-directory' and
+`spam-stat-process-non-spam-directory' only consider files whose
+modification time is less than this many days in the past.  Without
+this filter, re-training spam-stat with several thousand messages
+will start to take a very long time."
:type 'number
:group 'spam-stat)
+(defvar spam-stat-last-saved-at nil
+ "Modification time of `spam-stat-file' when it was last saved or loaded.
+Set by `spam-stat-save' and `spam-stat-load'; initially nil.")
+
(defvar spam-stat-syntax-table
(let ((table (copy-syntax-table text-mode-syntax-table)))
(modify-syntax-entry ?- "w" table)
(defvar spam-stat-buffer-name " *spam stat buffer*"
"Name of the `spam-stat-buffer'.")
+(defvar spam-stat-coding-system
+ (if (mm-coding-system-p 'emacs-mule) 'emacs-mule 'raw-text) ; raw-text is the fallback when emacs-mule is not reported as available
+ "Coding system used for `spam-stat-file'.")
+
;; Hooking into Gnus
(defun spam-stat-store-current-buffer ()
(defvar spam-stat-nbad 0
"The number of bad mails in the dictionary.")
+(defvar spam-stat-error-holder nil ; used as the `condition-case' variable in `spam-stat-score-buffer' and `spam-stat-split-fancy'
+ "A holder for condition-case errors while scoring buffers.")
+
(defsubst spam-stat-good (entry)
"Return the number of times this word belongs to good mails."
(aref entry 0))
;; Parsing
(defmacro with-spam-stat-max-buffer-size (&rest body)
- "Narrows the buffer down to the first 4k characters, then evaluates BODY."
+ "Narrow the buffer down to the first 4k characters, then evaluate BODY."
`(save-restriction
(when (> (- (point-max)
(point-min))
,@body))
(defun spam-stat-buffer-words ()
- "Return a hash table of words and number of occurences in the buffer."
+ "Return a hash table of words and number of occurrences in the buffer."
+ (run-hooks 'spam-stat-washing-hook)
(with-spam-stat-max-buffer-size
(with-syntax-table spam-stat-syntax-table
(goto-char (point-min))
(lambda (word count)
(let ((entry (gethash word spam-stat)))
(if (not entry)
- (error "This buffer has unknown words in it.")
+ (gnus-message 8 "This buffer has unknown words in it")
(spam-stat-set-good entry (- (spam-stat-good entry) count))
(spam-stat-set-bad entry (+ (spam-stat-bad entry) count))
(spam-stat-set-score entry (spam-stat-compute-score entry))
(lambda (word count)
(let ((entry (gethash word spam-stat)))
(if (not entry)
- (error "This buffer has unknown words in it.")
+ (gnus-message 8 "This buffer has unknown words in it")
(spam-stat-set-good entry (+ (spam-stat-good entry) count))
(spam-stat-set-bad entry (- (spam-stat-bad entry) count))
(spam-stat-set-score entry (spam-stat-compute-score entry))
;; Saving and Loading
(defun spam-stat-save (&optional force)
- "Save the `spam-stat' hash table as lisp file."
- (interactive)
+ "Save the `spam-stat' hash table as lisp file.
+With a prefix argument save unconditionally."
+ (interactive "P") ; the raw prefix argument becomes FORCE
(when (or force spam-stat-dirty)
- (with-temp-buffer
- (let ((standard-output (current-buffer))
- (font-lock-maximum-size 0))
- (insert "(setq spam-stat-ngood "
- (number-to-string spam-stat-ngood)
- " spam-stat-nbad "
- (number-to-string spam-stat-nbad)
- " spam-stat (spam-stat-to-hash-table '(")
- (maphash (lambda (word entry)
- (prin1 (list word
- (spam-stat-good entry)
- (spam-stat-bad entry))))
- spam-stat)
- (insert ")))")
- (write-file spam-stat-file)))
- (setq spam-stat-dirty nil)))
+ (let ((coding-system-for-write spam-stat-coding-system)) ; same coding system as the cookie written on the first line
+ (with-temp-file spam-stat-file
+ (let ((standard-output (current-buffer)) ; so that `prin1' below prints into the temporary buffer
+ (font-lock-maximum-size 0))
+ (insert (format ";-*- coding: %s; -*-\n" spam-stat-coding-system))
+ (insert (format "(setq spam-stat-ngood %d spam-stat-nbad %d
+spam-stat (spam-stat-to-hash-table '(" spam-stat-ngood spam-stat-nbad))
+ (maphash (lambda (word entry)
+ (prin1 (list word
+ (spam-stat-good entry)
+ (spam-stat-bad entry))))
+ spam-stat)
+ (insert ")))")))) ; closes the quoted list, the `spam-stat-to-hash-table' call and the `setq' in the written file
+ (message "Saved %s." spam-stat-file)
+ (setq spam-stat-dirty nil
+ spam-stat-last-saved-at (nth 5 (file-attributes spam-stat-file))))) ; element 5 of `file-attributes' is the modification time; `spam-stat-load' compares against it
(defun spam-stat-load ()
"Read the `spam-stat' hash table from disk."
;; TODO: maybe we should warn the user if spam-stat-dirty is t?
- (load-file spam-stat-file)
- (setq spam-stat-dirty nil))
+ ;; Three outcomes: unsaved changes in memory (refuse to load), file
+ ;; unchanged since the last recorded modification time (skip), or
+ ;; load the file and record its modification time.
+ (let ((coding-system-for-read spam-stat-coding-system))
+   (if spam-stat-dirty
+       (message "Spam stat not loaded: spam-stat-dirty t")
+     (if (and (boundp 'spam-stat-last-saved-at)
+              spam-stat-last-saved-at
+              (equal spam-stat-last-saved-at
+                     (nth 5 (file-attributes spam-stat-file))))
+         (message "Spam stat file not loaded: no change in disk..")
+       (load-file spam-stat-file)
+       (setq spam-stat-dirty nil
+             spam-stat-last-saved-at
+             (nth 5 (file-attributes spam-stat-file)))))))
(defun spam-stat-to-hash-table (entries)
"Turn list ENTRIES into a hash table and store as `spam-stat'.
result))
(defun spam-stat-score-buffer ()
- "Return a score describing the spam-probability for this buffer."
+ "Return a score describing the spam-probability for this buffer.
+The base score is P / (P + Q), where P is the product of the second
+elements of the entries in `spam-stat-score-data' and Q the product of
+their complements (1 - x).  If `spam-stat-score-buffer-user' returns
+a number for the base score, that number is added to it.  An error
+signaled by a user function is reported with `message' and the base
+score is returned unchanged.  When called interactively, also show
+the result in the echo area."
+ (interactive) ; helps in debugging.
(setq spam-stat-score-data (spam-stat-buffer-words-with-scores))
- (let* ((probs (mapcar (lambda (e) (cadr e)) spam-stat-score-data))
- (prod (apply #'* probs)))
- (/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x))
- probs))))))
+ (let* ((probs (mapcar 'cadr spam-stat-score-data))
+        (prod (apply #'* probs))
+        (score0
+         (/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x))
+                                            probs)))))
+        (score1s
+         (condition-case
+             spam-stat-error-holder
+             (spam-stat-score-buffer-user score0)
+           ;; Do not fail the whole scoring run because of a user
+           ;; function, but do not hide the problem either.
+           (error (message "Error in spam-stat-score-buffer-user: %S"
+                           spam-stat-error-holder)
+                  nil)))
+        (ans
+         (if score1s (+ score0 score1s) score0)))
+   (when (interactive-p)
+     (message "%S" ans))
+   ans))
+
+(defun spam-stat-score-buffer-user (&rest args)
+  "Apply each of `spam-stat-score-buffer-user-functions' to ARGS.
+Return the sum of the results, or nil if any function returned nil.
+With an empty function list the sum is 0."
+  (let ((results (mapcar (lambda (scorer) (apply scorer args))
+                         spam-stat-score-buffer-user-functions)))
+    (unless (memq nil results)
+      (apply #'+ results))))
(defun spam-stat-split-fancy ()
"Return the name of the spam group if the current mail is spam.
Use this function on `nnmail-split-fancy'. If you are interested in
the raw data used for the last run of `spam-stat-score-buffer',
check the variable `spam-stat-score-data'."
- (condition-case var
+ (condition-case spam-stat-error-holder ; any error below is reported with `message' and turned into a nil result
(progn
(set-buffer spam-stat-buffer)
(goto-char (point-min))
- (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshhold)
+ (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshold) ; strictly greater: a score equal to the threshold is not treated as spam
(when (boundp 'nnmail-split-trace)
(mapc (lambda (entry)
(push entry nnmail-split-trace))
spam-stat-score-data))
spam-stat-split-fancy-spam-group))
- (error (message "Error in spam-stat-split-fancy: %S" var)
+ (error (message "Error in spam-stat-split-fancy: %S" spam-stat-error-holder)
nil)))
;; Testing
(dolist (f files)
(when (and (file-readable-p f)
(file-regular-p f)
- (> (nth 7 (file-attributes f)) 0))
+ (> (nth 7 (file-attributes f)) 0)
+ (< (time-to-number-of-days (time-since (nth 5 (file-attributes f))))
+ spam-stat-process-directory-age))
(setq count (1+ count))
(message "Reading %s: %.2f%%" dir (/ count max))
- (insert-file-contents f)
+ (insert-file-contents-literally f)
(spam-stat-strip-xref)
(funcall func)
(erase-buffer))))))
(setq count (1+ count))
(message "Reading %.2f%%, score %.2f"
(/ count max) (/ score count))
- (insert-file-contents f)
+ (insert-file-contents-literally f)
(setq buffer-score (spam-stat-score-buffer))
(when (> buffer-score 0.9)
(setq score (1+ score)))
(spam-stat-bad entry))
count)
(remhash key spam-stat)))
- spam-stat))
+ spam-stat)
+ (setq spam-stat-dirty t))
(defun spam-stat-install-hooks-function ()
- "Install the spam-stat function hooks"
+ "Install the spam-stat function hooks."
(interactive)
(add-hook 'nnmail-prepare-incoming-message-hook
'spam-stat-store-current-buffer)
(spam-stat-install-hooks-function))
(defun spam-stat-unload-hook ()
- "Uninstall the spam-stat function hooks"
+ "Uninstall the spam-stat function hooks."
(interactive)
(remove-hook 'nnmail-prepare-incoming-message-hook
'spam-stat-store-current-buffer)
(remove-hook 'gnus-select-article-hook
'spam-stat-store-gnus-article-buffer))
+(add-hook 'spam-stat-unload-hook 'spam-stat-unload-hook) ; puts the function above on the hook variable of the same name -- NOTE(review): presumably so `unload-feature' runs it, confirm
+
(provide 'spam-stat)
+;;; arch-tag: ff1d2200-8ddb-42fb-bb7b-1b5e20448554
;;; spam-stat.el ends here