X-Git-Url: https://cgit.sxemacs.org/?a=blobdiff_plain;f=lisp%2Fspam-stat.el;h=d6b20df78b8f1da09dca43339a65da80b1e2a629;hb=fcd99973cce5d0c0e07b280711c2514815d3b50f;hp=2b0d21eed977904b8a13f71bff6f456f142a1e1c;hpb=256e47c463e4af7c23a4e6d3f3b05329af96add2;p=gnus diff --git a/lisp/spam-stat.el b/lisp/spam-stat.el index 2b0d21eed..d6b20df78 100644 --- a/lisp/spam-stat.el +++ b/lisp/spam-stat.el @@ -1,6 +1,7 @@ ;;; spam-stat.el --- detecting spam based on statistics -;; Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc. +;; Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, +;; 2010 Free Software Foundation, Inc. ;; Author: Alex Schroeder ;; Keywords: network @@ -8,20 +9,18 @@ ;; This file is part of GNU Emacs. -;; This is free software; you can redistribute it and/or modify it -;; under the terms of the GNU General Public License as published by -;; the Free Software Foundation; either version 2, or (at your option) -;; any later version. +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. -;; This is distributed in the hope that it will be useful, but WITHOUT -;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public -;; License for more details. +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License -;; along with GNU Emacs; see the file COPYING. If not, write to the -;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, -;; Boston, MA 02111-1307, USA. +;; along with GNU Emacs. If not, see . ;;; Commentary: @@ -124,11 +123,14 @@ ;;; Code: (require 'mail-parse) +(defvar gnus-original-article-buffer) + (defgroup spam-stat nil "Statistical spam detection for Emacs. Use the functions to build a dictionary of words and their statistical distribution in spam and non-spam mails. Then use a function to determine whether a buffer contains spam or not." + :version "22.1" :group 'gnus) (defcustom spam-stat-file "~/.spam-stat.el" @@ -161,14 +163,14 @@ This variable says how many characters this will be." :group 'spam-stat) (defcustom spam-stat-split-fancy-spam-group "mail.spam" - "Name of the group where spam should be stored, if -`spam-stat-split-fancy' is used in fancy splitting rules. Has no -effect when spam-stat is invoked through spam.el." + "Name of the group where spam should be stored. +If `spam-stat-split-fancy' is used in fancy splitting rules. Has +no effect when spam-stat is invoked through spam.el." :type 'string :group 'spam-stat) -(defcustom spam-stat-split-fancy-spam-threshhold 0.9 - "Spam score threshhold in spam-stat-split-fancy." +(defcustom spam-stat-split-fancy-spam-threshold 0.9 + "Spam score threshold in spam-stat-split-fancy." :type 'number :group 'spam-stat) @@ -177,13 +179,36 @@ effect when spam-stat is invoked through spam.el." :type 'hook :group 'spam-stat) +(defcustom spam-stat-score-buffer-user-functions nil + "List of additional scoring functions. +Called one by one on the buffer. + +If all of these functions return non-nil answers, these numerical +answers are added to the computed spam stat score on the buffer. If +you defun such functions, make sure they don't return the buffer in a +narrowed state or such: use, for example, `save-excursion'. Each of +your functions is also passed the initial spam-stat score which might +aid in your scoring. + +Also be careful when defining such functions. If they take a long +time, they will slow down your mail splitting. Thus, if the buffer is +large, don't forget to use smaller regions, by wrapping your work in, +say, `with-spam-stat-max-buffer-size'." + :type '(repeat sexp) + :group 'spam-stat) + (defcustom spam-stat-process-directory-age 90 "Max. age of files to be processed in directory, in days. When using `spam-stat-process-spam-directory' or `spam-stat-process-non-spam-directory', only files that have been touched in this many days will be considered. Without this filter, re-training spam-stat with several thousand messages -will start to take a very long time.") +will start to take a very long time." + :type 'number + :group 'spam-stat) + +(defvar spam-stat-last-saved-at nil + "Time stamp of last change of spam-stat-file on this run") (defvar spam-stat-syntax-table (let ((table (copy-syntax-table text-mode-syntax-table))) @@ -208,7 +233,7 @@ This is set by hooking into Gnus.") "Name of the `spam-stat-buffer'.") (defvar spam-stat-coding-system - (if (coding-system-p 'emacs-mule) 'emacs-mule 'raw-text) + (if (mm-coding-system-p 'emacs-mule) 'emacs-mule 'raw-text) "Coding system used for `spam-stat-file'.") ;; Hooking into Gnus @@ -242,6 +267,9 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (defvar spam-stat-nbad 0 "The number of bad mails in the dictionary.") +(defvar spam-stat-error-holder nil + "A holder for condition-case errors while scoring buffers.") + (defsubst spam-stat-good (entry) "Return the number of times this word belongs to good mails." (aref entry 0)) @@ -296,7 +324,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', ;; Parsing (defmacro with-spam-stat-max-buffer-size (&rest body) - "Narrows the buffer down to the first 4k characters, then evaluates BODY." + "Narrow the buffer down to the first 4k characters, then evaluate BODY." `(save-restriction (when (> (- (point-max) (point-min)) @@ -306,7 +334,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', ,@body)) (defun spam-stat-buffer-words () - "Return a hash table of words and number of occurences in the buffer." + "Return a hash table of words and number of occurrences in the buffer." (run-hooks 'spam-stat-washing-hook) (with-spam-stat-max-buffer-size (with-syntax-table spam-stat-syntax-table @@ -348,6 +376,8 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (spam-stat-buffer-words)) (setq spam-stat-dirty t)) +(autoload 'gnus-message "gnus-util") + (defun spam-stat-buffer-change-to-spam () "Consider current buffer no longer normal mail but spam." (setq spam-stat-nbad (1+ spam-stat-nbad) @@ -356,7 +386,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (lambda (word count) (let ((entry (gethash word spam-stat))) (if (not entry) - (error "This buffer has unknown words in it.") + (gnus-message 8 "This buffer has unknown words in it") (spam-stat-set-good entry (- (spam-stat-good entry) count)) (spam-stat-set-bad entry (+ (spam-stat-bad entry) count)) (spam-stat-set-score entry (spam-stat-compute-score entry)) @@ -372,7 +402,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (lambda (word count) (let ((entry (gethash word spam-stat))) (if (not entry) - (error "This buffer has unknown words in it.") + (gnus-message 8 "This buffer has unknown words in it") (spam-stat-set-good entry (+ (spam-stat-good entry) count)) (spam-stat-set-bad entry (- (spam-stat-bad entry) count)) (spam-stat-set-score entry (spam-stat-compute-score entry)) @@ -400,14 +430,25 @@ spam-stat (spam-stat-to-hash-table '(" spam-stat-ngood spam-stat-nbad)) (spam-stat-bad entry)))) spam-stat) (insert ")))")))) - (setq spam-stat-dirty nil))) + (message "Saved %s." spam-stat-file) + (setq spam-stat-dirty nil + spam-stat-last-saved-at (nth 5 (file-attributes spam-stat-file))))) (defun spam-stat-load () "Read the `spam-stat' hash table from disk." ;; TODO: maybe we should warn the user if spam-stat-dirty is t? (let ((coding-system-for-read spam-stat-coding-system)) - (load-file spam-stat-file)) - (setq spam-stat-dirty nil)) + (cond (spam-stat-dirty (message "Spam stat not loaded: spam-stat-dirty t")) + ((or (not (boundp 'spam-stat-last-saved-at)) + (null spam-stat-last-saved-at) + (not (equal spam-stat-last-saved-at + (nth 5 (file-attributes spam-stat-file))))) + (progn + (load-file spam-stat-file) + (setq spam-stat-dirty nil + spam-stat-last-saved-at + (nth 5 (file-attributes spam-stat-file))))) + (t (message "Spam stat file not loaded: no change in disk."))))) (defun spam-stat-to-hash-table (entries) "Turn list ENTRIES into a hash table and store as `spam-stat'. @@ -461,40 +502,64 @@ where DIFF is the difference between SCORE and 0.5." result)) (defun spam-stat-score-buffer () - "Return a score describing the spam-probability for this buffer." + "Return a score describing the spam-probability for this buffer. +Add user supplied modifications if supplied." + (interactive) ; helps in debugging. (setq spam-stat-score-data (spam-stat-buffer-words-with-scores)) - (let* ((probs (mapcar (lambda (e) (cadr e)) spam-stat-score-data)) - (prod (apply #'* probs))) - (/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x)) - probs)))))) + (let* ((probs (mapcar 'cadr spam-stat-score-data)) + (prod (apply #'* probs)) + (score0 + (/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x)) + probs))))) + (score1s + (condition-case + spam-stat-error-holder + (spam-stat-score-buffer-user score0) + (error nil))) + (ans + (if score1s (+ score0 score1s) score0))) + (when (interactive-p) + (message "%S" ans)) + ans)) + +(defun spam-stat-score-buffer-user (&rest args) + (let* ((scores + (mapcar + (lambda (fn) + (apply fn args)) + spam-stat-score-buffer-user-functions))) + (if (memq nil scores) nil + (apply #'+ scores)))) (defun spam-stat-split-fancy () "Return the name of the spam group if the current mail is spam. Use this function on `nnmail-split-fancy'. If you are interested in the raw data used for the last run of `spam-stat-score-buffer', check the variable `spam-stat-score-data'." - (condition-case var + (condition-case spam-stat-error-holder (progn (set-buffer spam-stat-buffer) (goto-char (point-min)) - (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshhold) + (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshold) (when (boundp 'nnmail-split-trace) (mapc (lambda (entry) (push entry nnmail-split-trace)) spam-stat-score-data)) spam-stat-split-fancy-spam-group)) - (error (message "Error in spam-stat-split-fancy: %S" var) + (error (message "Error in spam-stat-split-fancy: %S" spam-stat-error-holder) nil))) ;; Testing (defun spam-stat-strip-xref () - "Strip the the Xref header." + "Strip the Xref header." (save-restriction (mail-narrow-to-head) (when (re-search-forward "^Xref:.*\n" nil t) (delete-region (match-beginning 0) (match-end 0))))) +(autoload 'time-to-number-of-days "time-date") + (defun spam-stat-process-directory (dir func) "Process all the regular files in directory DIR using function FUNC." (let* ((files (directory-files dir t "^[^.]")) @@ -587,7 +652,7 @@ COUNT defaults to 5" (setq spam-stat-dirty t)) (defun spam-stat-install-hooks-function () - "Install the spam-stat function hooks" + "Install the spam-stat function hooks." (interactive) (add-hook 'nnmail-prepare-incoming-message-hook 'spam-stat-store-current-buffer) @@ -598,13 +663,15 @@ COUNT defaults to 5" (spam-stat-install-hooks-function)) (defun spam-stat-unload-hook () - "Uninstall the spam-stat function hooks" + "Uninstall the spam-stat function hooks." (interactive) (remove-hook 'nnmail-prepare-incoming-message-hook 'spam-stat-store-current-buffer) (remove-hook 'gnus-select-article-hook 'spam-stat-store-gnus-article-buffer)) +(add-hook 'spam-stat-unload-hook 'spam-stat-unload-hook) + (provide 'spam-stat) ;;; spam-stat.el ends here