X-Git-Url: https://cgit.sxemacs.org/?a=blobdiff_plain;f=lisp%2Fspam-stat.el;h=f4c5ec93b1fac92f02df3520ee1607b20ff9804c;hb=2f199ef9b4f7b43022f35ac5d00cd7e9389fa774;hp=70d20bccf4ec65237f554f82ffdf83be353ce0f4;hpb=341ed1f5b7b690a864028d3f8b7c2e88135e6a9f;p=gnus diff --git a/lisp/spam-stat.el b/lisp/spam-stat.el index 70d20bccf..f4c5ec93b 100644 --- a/lisp/spam-stat.el +++ b/lisp/spam-stat.el @@ -1,14 +1,12 @@ ;;; spam-stat.el --- detecting spam based on statistics -;; Copyright (C) 2002 Alex Schroeder +;; Copyright (C) 2002, 2003 Free Software Foundation, Inc. ;; Author: Alex Schroeder -;; Maintainer: Alex Schroeder -;; Version: 0.3.3 -;; Keywords: spam filtering gnus +;; Keywords: network ;; URL: http://www.emacswiki.org/cgi-bin/wiki.pl?SpamStat -;; This file is NOT part of GNU Emacs. +;; This file is part of GNU Emacs. ;; This is free software; you can redistribute it and/or modify it ;; under the terms of the GNU General Public License as published by @@ -41,7 +39,7 @@ ;; considered to be a new spam mail; use this for new mail that has ;; not been processed before ;; -;; `spam-stat-buffer-is-no-spam' -- called in a buffer, that buffer +;; `spam-stat-buffer-is-non-spam' -- called in a buffer, that buffer ;; is considered to be a new non-spam mail; use this for new mail that ;; has not been processed before ;; @@ -69,7 +67,7 @@ ;; the rule (: spam-stat-split-fancy) to `nnmail-split-fancy' ;; ;; This requires the following in your ~/.gnus file: -;; +;; ;; (require 'spam-stat) ;; (spam-stat-load) @@ -77,7 +75,7 @@ ;; Typical test will involve calls to the following functions: ;; -;; Reset: (setq spam-stat (make-hash-table :test 'equal)) +;; Reset: (spam-stat-reset) ;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam") ;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc") ;; Save table: (spam-stat-save) @@ -98,7 +96,7 @@ ;; rules in `nnmail-split-fancy'. Somewhere among these rules, you ;; will filter spam. 
Here is how you would create your dictionary: -;; Reset: (setq spam-stat (make-hash-table :test 'equal)) +;; Reset: (spam-stat-reset) ;; Learn spam: (spam-stat-process-spam-directory "~/Mail/mail/spam") ;; Learn non-spam: (spam-stat-process-non-spam-directory "~/Mail/mail/misc") ;; Repeat for any other non-spam group you need... @@ -118,6 +116,8 @@ ;; Ted Zlatanov ;; Jesper Harder ;; Dan Schmidt +;; Lasse Rasinen +;; Milan Zamazal @@ -127,7 +127,7 @@ "Statistical spam detection for Emacs. Use the functions to build a dictionary of words and their statistical distribution in spam and non-spam mails. Then use a function to determine -wether a buffer contains spam or not." +whether a buffer contains spam or not." :group 'gnus) (defcustom spam-stat-file "~/.spam-stat.el" @@ -136,6 +136,12 @@ See `spam-stat-to-hash-table' for the format of the file." :type 'file :group 'spam-stat) +(defcustom spam-stat-install-hooks t + "Whether spam-stat should install its hooks in Gnus. +This is set to nil if you use spam-stat through spam.el." + :type 'boolean + :group 'spam-stat) + (defcustom spam-stat-unknown-word-score 0.2 "The score to use for unknown words. Also used for words that don't appear often enough." @@ -155,10 +161,16 @@ This variable says how many characters this will be." (defcustom spam-stat-split-fancy-spam-group "mail.spam" "Name of the group where spam should be stored, if -`spam-stat-split-fancy' is used in fancy splitting rules." +`spam-stat-split-fancy' is used in fancy splitting rules. Has no +effect when spam-stat is invoked through spam.el." :type 'string :group 'spam-stat) +(defcustom spam-stat-split-fancy-spam-threshhold 0.9 + "Spam score threshold in spam-stat-split-fancy." 
+ :type 'number + :group 'spam-stat) + (defvar spam-stat-syntax-table (let ((table (copy-syntax-table text-mode-syntax-table))) (modify-syntax-entry ?- "w" table) @@ -178,6 +190,36 @@ This is set by hooking into Gnus.") (defvar spam-stat-buffer-name " *spam stat buffer*" "Name of the `spam-stat-buffer'.") +;; Functions missing in Emacs 20 + +(when (memq nil (mapcar 'fboundp + '(gethash hash-table-count make-hash-table + mapc puthash))) + (require 'cl) + (unless (fboundp 'puthash) + ;; alias puthash is missing from Emacs 20 cl-extra.el + (defalias 'puthash 'cl-puthash))) + +(eval-when-compile + (unless (fboundp 'with-syntax-table) + ;; Imported from Emacs 21.2 + (defmacro with-syntax-table (table &rest body) "\ +Evaluate BODY with syntax table of current buffer set to a copy of TABLE. +The syntax table of the current buffer is saved, BODY is evaluated, and the +saved table is restored, even in case of an abnormal exit. +Value is what BODY returns." + (let ((old-table (make-symbol "table")) + (old-buffer (make-symbol "buffer"))) + `(let ((,old-table (syntax-table)) + (,old-buffer (current-buffer))) + (unwind-protect + (progn + (set-syntax-table (copy-syntax-table ,table)) + ,@body) + (save-current-buffer + (set-buffer ,old-buffer) + (set-syntax-table ,old-table)))))))) + ;; Hooking into Gnus (defun spam-stat-store-current-buffer () @@ -196,10 +238,11 @@ This uses `gnus-article-buffer'." 
(set-buffer gnus-original-article-buffer) (spam-stat-store-current-buffer))) -(add-hook 'nnmail-prepare-incoming-message-hook - 'spam-stat-store-current-buffer) -(add-hook 'gnus-select-article-hook - 'spam-stat-store-gnus-article-buffer) +(when spam-stat-install-hooks + (add-hook 'nnmail-prepare-incoming-message-hook + 'spam-stat-store-current-buffer) + (add-hook 'gnus-select-article-hook + 'spam-stat-store-gnus-article-buffer)) ;; Data -- not using defstruct in order to save space and time @@ -313,12 +356,12 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (lambda (word count) (let ((entry (gethash word spam-stat))) (if entry - (spam-stat-set-good entry (+ count (spam-stat-good entry))) + (spam-stat-set-good entry (+ count (spam-stat-good entry))) (setq entry (spam-stat-make-entry count 0))) (spam-stat-set-score entry (spam-stat-compute-score entry)) (puthash word entry spam-stat))) (spam-stat-buffer-words))) - + (defun spam-stat-buffer-change-to-spam () "Consider current buffer no longer normal mail but spam." (setq spam-stat-nbad (1+ spam-stat-nbad) @@ -353,19 +396,20 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (defun spam-stat-save () "Save the `spam-stat' hash table as lisp file." + (interactive) (with-temp-buffer (let ((standard-output (current-buffer))) - (insert "(setq spam-stat (spam-stat-to-hash-table '(") + (insert "(setq spam-stat-ngood " + (number-to-string spam-stat-ngood) + " spam-stat-nbad " + (number-to-string spam-stat-nbad) + " spam-stat (spam-stat-to-hash-table '(") (maphash (lambda (word entry) (prin1 (list word (spam-stat-good entry) (spam-stat-bad entry)))) spam-stat) - (insert ")) spam-stat-ngood " - (number-to-string spam-stat-ngood) - " spam-stat-nbad " - (number-to-string spam-stat-nbad) - ")")) + (insert ")))")) (write-file spam-stat-file))) (defun spam-stat-load () @@ -387,6 +431,14 @@ has appeared in bad mails." entries) table)) +(defun spam-stat-reset () + "Reset `spam-stat' to an empty hash-table. 
+This deletes all the statistics." + (interactive) + (setq spam-stat (make-hash-table :test 'equal) + spam-stat-ngood 0 + spam-stat-nbad 0)) + ;; Scoring buffers (defvar spam-stat-score-data nil @@ -397,7 +449,6 @@ has appeared in bad mails." The default score for unknown words is stored in `spam-stat-unknown-word-score'." (spam-stat-score (gethash word spam-stat))) - (defun spam-stat-buffer-words-with-scores () "Process current buffer, return the 15 most conspicuous words. @@ -433,7 +484,7 @@ check the variable `spam-stat-score-data'." (progn (set-buffer spam-stat-buffer) (goto-char (point-min)) - (when (> (spam-stat-score-buffer) 0.9) + (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshhold) (when (boundp 'nnmail-split-trace) (mapc (lambda (entry) (push entry nnmail-split-trace)) @@ -454,7 +505,7 @@ check the variable `spam-stat-score-data'." (when (and (file-readable-p f) (file-regular-p f)) (setq count (1+ count)) - (message "Reading %.2f%%" (/ count max)) + (message "Reading %s: %.2f%%" dir (/ count max)) (insert-file-contents f) (funcall func) (erase-buffer)))))) @@ -500,24 +551,19 @@ You can use this to determine error rates." ;; Shrinking the dictionary -(defun spam-stat-reduce-size (&optional count distance) +(defun spam-stat-reduce-size (&optional count) "Reduce the size of `spam-stat'. This removes all words that occur less than COUNT from the dictionary. -COUNT defaults to 5. It also removes all words whose spam score -is less than DISTANCE from 0.5. DISTANCE defaults to 0.1, meaning that -all words with score between 0.4 and 0.6 are removed." 
- (setq count (or count 5) - distance (or distance 0.1)) +COUNT defaults to 5." + (interactive) + (setq count (or count 5)) (maphash (lambda (key entry) - (when (or (< (+ (spam-stat-good entry) - (spam-stat-bad entry)) - count) - (< (abs (- (spam-stat-score entry) 0.5)) - distance)) + (when (< (+ (spam-stat-good entry) + (spam-stat-bad entry)) + count) (remhash key spam-stat))) spam-stat)) (provide 'spam-stat) ;;; spam-stat.el ends here -