;;; spam-stat.el --- detecting spam based on statistics
-;; Copyright (C) 2002 Alex Schroeder
+;; Copyright (C) 2002 Free Software Foundation, Inc.
;; Author: Alex Schroeder <alex@gnu.org>
-;; Maintainer: Alex Schroeder <alex@gnu.org>
-;; Version: 0.3.4
-;; Keywords: spam filtering gnus
+;; Keywords: network
;; URL: http://www.emacswiki.org/cgi-bin/wiki.pl?SpamStat
-;; This file is NOT part of GNU Emacs.
+;; This file is part of GNU Emacs.
;; This is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; considered to be a new spam mail; use this for new mail that has
;; not been processed before
;;
-;; `spam-stat-buffer-is-no-spam' -- called in a buffer, that buffer
+;; `spam-stat-buffer-is-non-spam' -- called in a buffer, that buffer
;; is considered to be a new non-spam mail; use this for new mail that
;; has not been processed before
;;
;; Typical test will involve calls to the following functions:
;;
-;; Reset: (setq spam-stat (make-hash-table :test 'equal))
+;; Reset: (spam-stat-reset)
;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam")
;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc")
;; Save table: (spam-stat-save)
;; rules in `nnmail-split-fancy'. Somewhere among these rules, you
;; will filter spam. Here is how you would create your dictionary:
-;; Reset: (setq spam-stat (make-hash-table :test 'equal))
+;; Reset: (spam-stat-reset)
;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam")
;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc")
;; Repeat for any other non-spam group you need...
;; Ted Zlatanov <tzz@lifelogs.com>
;; Jesper Harder <harder@myrealbox.com>
;; Dan Schmidt <dfan@dfan.org>
+;; Lasse Rasinen <lrasinen@iki.fi>
+;; Milan Zamazal <pdm@zamazal.org>
\f
"Statistical spam detection for Emacs.
Use the functions to build a dictionary of words and their statistical
distribution in spam and non-spam mails. Then use a function to determine
-wether a buffer contains spam or not."
+whether a buffer contains spam or not."
:group 'gnus)
(defcustom spam-stat-file "~/.spam-stat.el"
:type 'file
:group 'spam-stat)
+(defcustom spam-stat-install-hooks t
+ "Whether spam-stat should install its hooks in Gnus.
+When non-nil, loading spam-stat adds `spam-stat-store-current-buffer'
+to `nnmail-prepare-incoming-message-hook' and
+`spam-stat-store-gnus-article-buffer' to `gnus-select-article-hook'.
+The value is consulted when this file is loaded.
+This is set to nil if you use spam-stat through spam.el."
+ :type 'boolean
+ :group 'spam-stat)
+
(defcustom spam-stat-unknown-word-score 0.2
"The score to use for unknown words.
Also used for words that don't appear often enough."
(defcustom spam-stat-split-fancy-spam-group "mail.spam"
"Name of the group where spam should be stored, if
-`spam-stat-split-fancy' is used in fancy splitting rules."
+`spam-stat-split-fancy' is used in fancy splitting rules. Has no
+effect when spam-stat is invoked through spam.el."
:type 'string
:group 'spam-stat)
+(defcustom spam-stat-split-fancy-spam-threshhold 0.9
+ "Spam score threshold in `spam-stat-split-fancy'.
+The value returned by `spam-stat-score-buffer' is compared against it
+with `>'."
+ :type 'number
+ :group 'spam-stat)
+
(defvar spam-stat-syntax-table
(let ((table (copy-syntax-table text-mode-syntax-table)))
(modify-syntax-entry ?- "w" table)
;; Functions missing in Emacs 20
-(eval-and-compile
- (when (and (not (featurep 'xemacs))
- (= emacs-major-version 20))
- ;; gethash, hash-table-count, make-hash-table, mapc
- (require 'cl)
+(when (memq nil (mapcar 'fboundp ; non-nil when any function listed below is undefined
+ '(gethash hash-table-count make-hash-table
+ mapc puthash)))
+ (require 'cl)
+ (unless (fboundp 'puthash)
+ ;; The alias `puthash' is missing from Emacs 20 cl-extra.el, so
+ ;; define it here if loading cl did not provide it.
(defalias 'puthash 'cl-puthash)))
(eval-when-compile
(set-buffer gnus-original-article-buffer)
(spam-stat-store-current-buffer)))
-(add-hook 'nnmail-prepare-incoming-message-hook
- 'spam-stat-store-current-buffer)
-(add-hook 'gnus-select-article-hook
- 'spam-stat-store-gnus-article-buffer)
+(when spam-stat-install-hooks ; evaluated when this file is loaded
+ (add-hook 'nnmail-prepare-incoming-message-hook
+ 'spam-stat-store-current-buffer)
+ (add-hook 'gnus-select-article-hook
+ 'spam-stat-store-gnus-article-buffer))
;; Data -- not using defstruct in order to save space and time
(interactive)
(with-temp-buffer
(let ((standard-output (current-buffer)))
- (insert "(setq spam-stat (spam-stat-to-hash-table '(")
+ (insert "(setq spam-stat-ngood "
+ (number-to-string spam-stat-ngood)
+ " spam-stat-nbad "
+ (number-to-string spam-stat-nbad)
+ " spam-stat (spam-stat-to-hash-table '(")
(maphash (lambda (word entry)
(prin1 (list word
(spam-stat-good entry)
(spam-stat-bad entry))))
spam-stat)
- (insert ")) spam-stat-ngood "
- (number-to-string spam-stat-ngood)
- " spam-stat-nbad "
- (number-to-string spam-stat-nbad)
- ")"))
+ (insert ")))"))
(write-file spam-stat-file)))
(defun spam-stat-load ()
"Reset `spam-stat' to an empty hash-table.
This deletes all the statistics."
(interactive)
- (setq spam-stat (make-hash-table :test 'equal)))
+ (setq spam-stat (make-hash-table :test 'equal)
+ spam-stat-ngood 0
+ spam-stat-nbad 0))
;; Scoring buffers
`spam-stat-unknown-word-score'."
(spam-stat-score (gethash word spam-stat)))
-
(defun spam-stat-buffer-words-with-scores ()
"Process current buffer, return the 15 most conspicuous words.
These are the words whose spam-stat differs the most from 0.5.
(progn
(set-buffer spam-stat-buffer)
(goto-char (point-min))
- (when (> (spam-stat-score-buffer) 0.9)
+ (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshhold)
(when (boundp 'nnmail-split-trace)
(mapc (lambda (entry)
(push entry nnmail-split-trace))
(when (and (file-readable-p f)
(file-regular-p f))
(setq count (1+ count))
- (message "Reading %.2f%%" (/ count max))
+ (message "Reading %s: %.2f%%" dir (/ count max))
(insert-file-contents f)
(funcall func)
(erase-buffer))))))
;; Shrinking the dictionary
-(defun spam-stat-reduce-size (&optional count distance)
+(defun spam-stat-reduce-size (&optional count)
"Reduce the size of `spam-stat'.
This removes all words that occur less than COUNT from the dictionary.
-COUNT defaults to 5. It also removes all words whose spam score
-is less than DISTANCE from 0.5. DISTANCE defaults to 0.1, meaning that
-all words with score between 0.4 and 0.6 are removed."
+COUNT defaults to 5.  A word's occurrence count is the sum of
+`spam-stat-good' and `spam-stat-bad' for its entry."
(interactive)
- (setq count (or count 5)
- distance (or distance 0.1))
+ (setq count (or count 5))
(maphash (lambda (key entry)
- (when (or (< (+ (spam-stat-good entry)
- (spam-stat-bad entry))
- count)
- (< (abs (- (spam-stat-score entry) 0.5))
- distance))
+ (when (< (+ (spam-stat-good entry)
+ (spam-stat-bad entry))
+ count)
(remhash key spam-stat))) ; drops the entry currently being visited
spam-stat))
(provide 'spam-stat)
;;; spam-stat.el ends here
-