X-Git-Url: http://cgit.sxemacs.org/?a=blobdiff_plain;f=lisp%2Fspam-stat.el;h=bba1c2a70e832a99e396ce9ef177ff5c7f215e99;hb=17f456f97c1659bcdc5935b91a011cdad96b9150;hp=451c0857f9eef313144da5bcd326700ff8304f26;hpb=48d87eb631c741b3a0343a6ac795dd752983dfea;p=gnus diff --git a/lisp/spam-stat.el b/lisp/spam-stat.el index 451c0857f..bba1c2a70 100644 --- a/lisp/spam-stat.el +++ b/lisp/spam-stat.el @@ -1,6 +1,6 @@ ;;; spam-stat.el --- detecting spam based on statistics -;; Copyright (C) 2002, 2003 Free Software Foundation, Inc. +;; Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007 Free Software Foundation, Inc. ;; Author: Alex Schroeder ;; Keywords: network @@ -10,7 +10,7 @@ ;; This is free software; you can redistribute it and/or modify it ;; under the terms of the GNU General Public License as published by -;; the Free Software Foundation; either version 2, or (at your option) +;; the Free Software Foundation; either version 3, or (at your option) ;; any later version. ;; This is distributed in the hope that it will be useful, but WITHOUT @@ -20,8 +20,8 @@ ;; You should have received a copy of the GNU General Public License ;; along with GNU Emacs; see the file COPYING. If not, write to the -;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, -;; Boston, MA 02111-1307, USA. +;; Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, +;; Boston, MA 02110-1301, USA. ;;; Commentary: @@ -124,11 +124,14 @@ ;;; Code: (require 'mail-parse) +(defvar gnus-original-article-buffer) + (defgroup spam-stat nil "Statistical spam detection for Emacs. Use the functions to build a dictionary of words and their statistical distribution in spam and non-spam mails. Then use a function to determine whether a buffer contains spam or not." + :version "22.1" :group 'gnus) (defcustom spam-stat-file "~/.spam-stat.el" @@ -161,17 +164,53 @@ This variable says how many characters this will be." :group 'spam-stat) (defcustom spam-stat-split-fancy-spam-group "mail.spam" - "Name of the group where spam should be stored, if -`spam-stat-split-fancy' is used in fancy splitting rules. Has no -effect when spam-stat is invoked through spam.el." + "Name of the group where spam should be stored. +If `spam-stat-split-fancy' is used in fancy splitting rules. Has +no effect when spam-stat is invoked through spam.el." :type 'string :group 'spam-stat) -(defcustom spam-stat-split-fancy-spam-threshhold 0.9 - "Spam score threshhold in spam-stat-split-fancy." +(defcustom spam-stat-split-fancy-spam-threshold 0.9 + "Spam score threshold in spam-stat-split-fancy." + :type 'number + :group 'spam-stat) + +(defcustom spam-stat-washing-hook nil + "Hook applied to each message before analysis." + :type 'hook + :group 'spam-stat) + +(defcustom spam-stat-score-buffer-user-functions nil + "List of additional scoring functions. +Called one by one on the buffer. + +If all of these functions return non-nil answers, these numerical +answers are added to the computed spam stat score on the buffer. If +you defun such functions, make sure they don't return the buffer in a +narrowed state or such: use, for example, `save-excursion'. Each of +your functions is also passed the initial spam-stat score which might +aid in your scoring. + +Also be careful when defining such functions. If they take a long +time, they will slow down your mail splitting. Thus, if the buffer is +large, don't forget to use smaller regions, by wrapping your work in, +say, `with-spam-stat-max-buffer-size'." + :type '(repeat sexp) + :group 'spam-stat) + +(defcustom spam-stat-process-directory-age 90 + "Max. age of files to be processed in directory, in days. +When using `spam-stat-process-spam-directory' or +`spam-stat-process-non-spam-directory', only files that have +been touched in this many days will be considered. Without +this filter, re-training spam-stat with several thousand messages +will start to take a very long time." :type 'number :group 'spam-stat) +(defvar spam-stat-last-saved-at nil + "Time stamp of last change of spam-stat-file on this run") + (defvar spam-stat-syntax-table (let ((table (copy-syntax-table text-mode-syntax-table))) (modify-syntax-entry ?- "w" table) @@ -194,6 +233,10 @@ This is set by hooking into Gnus.") (defvar spam-stat-buffer-name " *spam stat buffer*" "Name of the `spam-stat-buffer'.") +(defvar spam-stat-coding-system + (if (mm-coding-system-p 'emacs-mule) 'emacs-mule 'raw-text) + "Coding system used for `spam-stat-file'.") + ;; Hooking into Gnus (defun spam-stat-store-current-buffer () @@ -225,6 +268,9 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (defvar spam-stat-nbad 0 "The number of bad mails in the dictionary.") +(defvar spam-stat-error-holder nil + "A holder for condition-case errors while scoring buffers.") + (defsubst spam-stat-good (entry) "Return the number of times this word belongs to good mails." (aref entry 0)) @@ -279,7 +325,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', ;; Parsing (defmacro with-spam-stat-max-buffer-size (&rest body) - "Narrows the buffer down to the first 4k characters, then evaluates BODY." + "Narrow the buffer down to the first 4k characters, then evaluate BODY." `(save-restriction (when (> (- (point-max) (point-min)) @@ -289,7 +335,8 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', ,@body)) (defun spam-stat-buffer-words () - "Return a hash table of words and number of occurences in the buffer." + "Return a hash table of words and number of occurrences in the buffer." + (run-hooks 'spam-stat-washing-hook) (with-spam-stat-max-buffer-size (with-syntax-table spam-stat-syntax-table (goto-char (point-min)) @@ -330,6 +377,8 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (spam-stat-buffer-words)) (setq spam-stat-dirty t)) +(autoload 'gnus-message "gnus-util") + (defun spam-stat-buffer-change-to-spam () "Consider current buffer no longer normal mail but spam." (setq spam-stat-nbad (1+ spam-stat-nbad) @@ -338,7 +387,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (lambda (word count) (let ((entry (gethash word spam-stat))) (if (not entry) - (error "This buffer has unknown words in it.") + (gnus-message 8 "This buffer has unknown words in it") (spam-stat-set-good entry (- (spam-stat-good entry) count)) (spam-stat-set-bad entry (+ (spam-stat-bad entry) count)) (spam-stat-set-score entry (spam-stat-compute-score entry)) @@ -354,7 +403,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (lambda (word count) (let ((entry (gethash word spam-stat))) (if (not entry) - (error "This buffer has unknown words in it.") + (gnus-message 8 "This buffer has unknown words in it") (spam-stat-set-good entry (+ (spam-stat-good entry) count)) (spam-stat-set-bad entry (- (spam-stat-bad entry) count)) (spam-stat-set-score entry (spam-stat-compute-score entry)) @@ -369,28 +418,38 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', With a prefix argument save unconditionally." (interactive "P") (when (or force spam-stat-dirty) - (with-temp-buffer - (let ((standard-output (current-buffer)) - (font-lock-maximum-size 0)) - (insert "(setq spam-stat-ngood " - (number-to-string spam-stat-ngood) - " spam-stat-nbad " - (number-to-string spam-stat-nbad) - " spam-stat (spam-stat-to-hash-table '(") - (maphash (lambda (word entry) - (prin1 (list word - (spam-stat-good entry) - (spam-stat-bad entry)))) - spam-stat) - (insert ")))") - (write-file spam-stat-file))) - (setq spam-stat-dirty nil))) + (let ((coding-system-for-write spam-stat-coding-system)) + (with-temp-file spam-stat-file + (let ((standard-output (current-buffer)) + (font-lock-maximum-size 0)) + (insert (format ";-*- coding: %s; -*-\n" spam-stat-coding-system)) + (insert (format "(setq spam-stat-ngood %d spam-stat-nbad %d +spam-stat (spam-stat-to-hash-table '(" spam-stat-ngood spam-stat-nbad)) + (maphash (lambda (word entry) + (prin1 (list word + (spam-stat-good entry) + (spam-stat-bad entry)))) + spam-stat) + (insert ")))")))) + (message "Saved %s." spam-stat-file) + (setq spam-stat-dirty nil + spam-stat-last-saved-at (nth 5 (file-attributes spam-stat-file))))) (defun spam-stat-load () "Read the `spam-stat' hash table from disk." ;; TODO: maybe we should warn the user if spam-stat-dirty is t? - (load-file spam-stat-file) - (setq spam-stat-dirty nil)) + (let ((coding-system-for-read spam-stat-coding-system)) + (cond (spam-stat-dirty (message "Spam stat not loaded: spam-stat-dirty t")) + ((or (not (boundp 'spam-stat-last-saved-at)) + (null spam-stat-last-saved-at) + (not (equal spam-stat-last-saved-at + (nth 5 (file-attributes spam-stat-file))))) + (progn + (load-file spam-stat-file) + (setq spam-stat-dirty nil + spam-stat-last-saved-at + (nth 5 (file-attributes spam-stat-file))))) + (t (message "Spam stat file not loaded: no change in disk.."))))) (defun spam-stat-to-hash-table (entries) "Turn list ENTRIES into a hash table and store as `spam-stat'. @@ -444,29 +503,51 @@ where DIFF is the difference between SCORE and 0.5." result)) (defun spam-stat-score-buffer () - "Return a score describing the spam-probability for this buffer." + "Return a score describing the spam-probability for this buffer. +Add user supplied modifications if supplied." + (interactive) ; helps in debugging. (setq spam-stat-score-data (spam-stat-buffer-words-with-scores)) - (let* ((probs (mapcar (lambda (e) (cadr e)) spam-stat-score-data)) - (prod (apply #'* probs))) - (/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x)) - probs)))))) + (let* ((probs (mapcar 'cadr spam-stat-score-data)) + (prod (apply #'* probs)) + (score0 + (/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x)) + probs))))) + (score1s + (condition-case + spam-stat-error-holder + (spam-stat-score-buffer-user score0) + (error nil))) + (ans + (if score1s (+ score0 score1s) score0))) + (when (interactive-p) + (message "%S" ans)) + ans)) + +(defun spam-stat-score-buffer-user (&rest args) + (let* ((scores + (mapcar + (lambda (fn) + (apply fn args)) + spam-stat-score-buffer-user-functions))) + (if (memq nil scores) nil + (apply #'+ scores)))) (defun spam-stat-split-fancy () "Return the name of the spam group if the current mail is spam. Use this function on `nnmail-split-fancy'. If you are interested in the raw data used for the last run of `spam-stat-score-buffer', check the variable `spam-stat-score-data'." - (condition-case var + (condition-case spam-stat-error-holder (progn (set-buffer spam-stat-buffer) (goto-char (point-min)) - (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshhold) + (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshold) (when (boundp 'nnmail-split-trace) (mapc (lambda (entry) (push entry nnmail-split-trace)) spam-stat-score-data)) spam-stat-split-fancy-spam-group)) - (error (message "Error in spam-stat-split-fancy: %S" var) + (error (message "Error in spam-stat-split-fancy: %S" spam-stat-error-holder) nil))) ;; Testing @@ -487,10 +568,12 @@ check the variable `spam-stat-score-data'." (dolist (f files) (when (and (file-readable-p f) (file-regular-p f) - (> (nth 7 (file-attributes f)) 0)) + (> (nth 7 (file-attributes f)) 0) + (< (time-to-number-of-days (time-since (nth 5 (file-attributes f)))) + spam-stat-process-directory-age)) (setq count (1+ count)) (message "Reading %s: %.2f%%" dir (/ count max)) - (insert-file-contents f) + (insert-file-contents-literally f) (spam-stat-strip-xref) (funcall func) (erase-buffer)))))) @@ -535,7 +618,7 @@ display non-spam files; otherwise display spam files." (setq count (1+ count)) (message "Reading %.2f%%, score %.2f" (/ count max) (/ score count)) - (insert-file-contents f) + (insert-file-contents-literally f) (setq buffer-score (spam-stat-score-buffer)) (when (> buffer-score 0.9) (setq score (1+ score))) @@ -568,7 +651,7 @@ COUNT defaults to 5" (setq spam-stat-dirty t)) (defun spam-stat-install-hooks-function () - "Install the spam-stat function hooks" + "Install the spam-stat function hooks." (interactive) (add-hook 'nnmail-prepare-incoming-message-hook 'spam-stat-store-current-buffer) @@ -579,13 +662,16 @@ COUNT defaults to 5" (spam-stat-install-hooks-function)) (defun spam-stat-unload-hook () - "Uninstall the spam-stat function hooks" + "Uninstall the spam-stat function hooks." (interactive) (remove-hook 'nnmail-prepare-incoming-message-hook 'spam-stat-store-current-buffer) (remove-hook 'gnus-select-article-hook 'spam-stat-store-gnus-article-buffer)) +(add-hook 'spam-stat-unload-hook 'spam-stat-unload-hook) + (provide 'spam-stat) +;;; arch-tag: ff1d2200-8ddb-42fb-bb7b-1b5e20448554 ;;; spam-stat.el ends here