X-Git-Url: http://cgit.sxemacs.org/?a=blobdiff_plain;f=lisp%2Fspam-stat.el;h=ec8b9f3ca7d280e83ddef1f3289612c9f0222e8b;hb=f47436b1311c86cdbf072f6212f3153ef2a35a43;hp=97aa160473689f421c1206aa177071ece55a13bf;hpb=6b995a4cc29d045bd3d696a3646ed7e343f3daaf;p=gnus diff --git a/lisp/spam-stat.el b/lisp/spam-stat.el index 97aa16047..ec8b9f3ca 100644 --- a/lisp/spam-stat.el +++ b/lisp/spam-stat.el @@ -1,6 +1,6 @@ ;;; spam-stat.el --- detecting spam based on statistics -;; Copyright (C) 2002, 2003 Free Software Foundation, Inc. +;; Copyright (C) 2002-2014 Free Software Foundation, Inc. ;; Author: Alex Schroeder ;; Keywords: network @@ -8,20 +8,18 @@ ;; This file is part of GNU Emacs. -;; This is free software; you can redistribute it and/or modify it -;; under the terms of the GNU General Public License as published by -;; the Free Software Foundation; either version 2, or (at your option) -;; any later version. +;; GNU Emacs is free software: you can redistribute it and/or modify +;; it under the terms of the GNU General Public License as published by +;; the Free Software Foundation, either version 3 of the License, or +;; (at your option) any later version. -;; This is distributed in the hope that it will be useful, but WITHOUT -;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public -;; License for more details. +;; GNU Emacs is distributed in the hope that it will be useful, +;; but WITHOUT ANY WARRANTY; without even the implied warranty of +;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +;; GNU General Public License for more details. ;; You should have received a copy of the GNU General Public License -;; along with GNU Emacs; see the file COPYING. If not, write to the -;; Free Software Foundation, Inc., 59 Temple Place - Suite 330, -;; Boston, MA 02111-1307, USA. +;; along with GNU Emacs. If not, see . ;;; Commentary: @@ -122,12 +120,16 @@ ;;; Code: +(require 'mail-parse) + +(defvar gnus-original-article-buffer) (defgroup spam-stat nil "Statistical spam detection for Emacs. Use the functions to build a dictionary of words and their statistical distribution in spam and non-spam mails. Then use a function to determine whether a buffer contains spam or not." + :version "22.1" :group 'gnus) (defcustom spam-stat-file "~/.spam-stat.el" @@ -136,12 +138,6 @@ See `spam-stat-to-hash-table' for the format of the file." :type 'file :group 'spam-stat) -(defcustom spam-stat-install-hooks t - "Whether spam-stat should install its hooks in Gnus. -This is set to nil if you use spam-stat through spam.el." - :type 'boolean - :group 'spam-stat) - (defcustom spam-stat-unknown-word-score 0.2 "The score to use for unknown words. Also used for words that don't appear often enough." @@ -160,17 +156,53 @@ This variable says how many characters this will be." :group 'spam-stat) (defcustom spam-stat-split-fancy-spam-group "mail.spam" - "Name of the group where spam should be stored, if -`spam-stat-split-fancy' is used in fancy splitting rules. Has no -effect when spam-stat is invoked through spam.el." + "Name of the group where spam should be stored. +If `spam-stat-split-fancy' is used in fancy splitting rules. Has +no effect when spam-stat is invoked through spam.el." :type 'string :group 'spam-stat) -(defcustom spam-stat-split-fancy-spam-threshhold 0.9 - "Spam score threshhold in spam-stat-split-fancy." +(defcustom spam-stat-split-fancy-spam-threshold 0.9 + "Spam score threshold in spam-stat-split-fancy." :type 'number :group 'spam-stat) +(defcustom spam-stat-washing-hook nil + "Hook applied to each message before analysis." + :type 'hook + :group 'spam-stat) + +(defcustom spam-stat-score-buffer-user-functions nil + "List of additional scoring functions. +Called one by one on the buffer. + +If all of these functions return non-nil answers, these numerical +answers are added to the computed spam stat score on the buffer. If +you defun such functions, make sure they don't return the buffer in a +narrowed state or such: use, for example, `save-excursion'. Each of +your functions is also passed the initial spam-stat score which might +aid in your scoring. + +Also be careful when defining such functions. If they take a long +time, they will slow down your mail splitting. Thus, if the buffer is +large, don't forget to use smaller regions, by wrapping your work in, +say, `with-spam-stat-max-buffer-size'." + :type '(repeat sexp) + :group 'spam-stat) + +(defcustom spam-stat-process-directory-age 90 + "Max. age of files to be processed in directory, in days. +When using `spam-stat-process-spam-directory' or +`spam-stat-process-non-spam-directory', only files that have +been touched in this many days will be considered. Without +this filter, re-training spam-stat with several thousand messages +will start to take a very long time." + :type 'number + :group 'spam-stat) + +(defvar spam-stat-last-saved-at nil + "Time stamp of last change of spam-stat-file on this run") + (defvar spam-stat-syntax-table (let ((table (copy-syntax-table text-mode-syntax-table))) (modify-syntax-entry ?- "w" table) @@ -193,22 +225,24 @@ This is set by hooking into Gnus.") (defvar spam-stat-buffer-name " *spam stat buffer*" "Name of the `spam-stat-buffer'.") +(defvar spam-stat-coding-system + (if (mm-coding-system-p 'emacs-mule) 'emacs-mule 'raw-text) + "Coding system used for `spam-stat-file'.") + ;; Hooking into Gnus (defun spam-stat-store-current-buffer () "Store a copy of the current buffer in `spam-stat-buffer'." - (save-excursion - (let ((str (buffer-string))) - (set-buffer (get-buffer-create spam-stat-buffer-name)) + (let ((buf (current-buffer))) + (with-current-buffer (get-buffer-create spam-stat-buffer-name) (erase-buffer) - (insert str) + (insert-buffer-substring buf) (setq spam-stat-buffer (current-buffer))))) (defun spam-stat-store-gnus-article-buffer () "Store a copy of the current article in `spam-stat-buffer'. This uses `gnus-article-buffer'." - (save-excursion - (set-buffer gnus-original-article-buffer) + (with-current-buffer gnus-original-article-buffer (spam-stat-store-current-buffer))) ;; Data -- not using defstruct in order to save space and time @@ -226,6 +260,9 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (defvar spam-stat-nbad 0 "The number of bad mails in the dictionary.") +(defvar spam-stat-error-holder nil + "A holder for condition-case errors while scoring buffers.") + (defsubst spam-stat-good (entry) "Return the number of times this word belongs to good mails." (aref entry 0)) @@ -280,7 +317,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', ;; Parsing (defmacro with-spam-stat-max-buffer-size (&rest body) - "Narrows the buffer down to the first 4k characters, then evaluates BODY." + "Narrow the buffer down to the first 4k characters, then evaluate BODY." `(save-restriction (when (> (- (point-max) (point-min)) @@ -290,7 +327,8 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', ,@body)) (defun spam-stat-buffer-words () - "Return a hash table of words and number of occurences in the buffer." + "Return a hash table of words and number of occurrences in the buffer." + (run-hooks 'spam-stat-washing-hook) (with-spam-stat-max-buffer-size (with-syntax-table spam-stat-syntax-table (goto-char (point-min)) @@ -331,6 +369,8 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (spam-stat-buffer-words)) (setq spam-stat-dirty t)) +(autoload 'gnus-message "gnus-util") + (defun spam-stat-buffer-change-to-spam () "Consider current buffer no longer normal mail but spam." (setq spam-stat-nbad (1+ spam-stat-nbad) @@ -339,7 +379,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (lambda (word count) (let ((entry (gethash word spam-stat))) (if (not entry) - (error "This buffer has unknown words in it.") + (gnus-message 8 "This buffer has unknown words in it") (spam-stat-set-good entry (- (spam-stat-good entry) count)) (spam-stat-set-bad entry (+ (spam-stat-bad entry) count)) (spam-stat-set-score entry (spam-stat-compute-score entry)) @@ -355,7 +395,7 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', (lambda (word count) (let ((entry (gethash word spam-stat))) (if (not entry) - (error "This buffer has unknown words in it.") + (gnus-message 8 "This buffer has unknown words in it") (spam-stat-set-good entry (+ (spam-stat-good entry) count)) (spam-stat-set-bad entry (- (spam-stat-bad entry) count)) (spam-stat-set-score entry (spam-stat-compute-score entry)) @@ -366,31 +406,41 @@ Use `spam-stat-ngood', `spam-stat-nbad', `spam-stat-good', ;; Saving and Loading (defun spam-stat-save (&optional force) - "Save the `spam-stat' hash table as lisp file." - (interactive) + "Save the `spam-stat' hash table as lisp file. +With a prefix argument save unconditionally." + (interactive "P") (when (or force spam-stat-dirty) - (with-temp-buffer - (let ((standard-output (current-buffer)) - (font-lock-maximum-size 0)) - (insert "(setq spam-stat-ngood " - (number-to-string spam-stat-ngood) - " spam-stat-nbad " - (number-to-string spam-stat-nbad) - " spam-stat (spam-stat-to-hash-table '(") - (maphash (lambda (word entry) - (prin1 (list word - (spam-stat-good entry) - (spam-stat-bad entry)))) - spam-stat) - (insert ")))") - (write-file spam-stat-file))) - (setq spam-stat-dirty nil))) + (let ((coding-system-for-write spam-stat-coding-system)) + (with-temp-file spam-stat-file + (let ((standard-output (current-buffer))) + (insert (format ";-*- coding: %s; -*-\n" spam-stat-coding-system)) + (insert (format "(setq spam-stat-ngood %d spam-stat-nbad %d +spam-stat (spam-stat-to-hash-table '(" spam-stat-ngood spam-stat-nbad)) + (maphash (lambda (word entry) + (prin1 (list word + (spam-stat-good entry) + (spam-stat-bad entry)))) + spam-stat) + (insert ")))")))) + (message "Saved %s." spam-stat-file) + (setq spam-stat-dirty nil + spam-stat-last-saved-at (nth 5 (file-attributes spam-stat-file))))) (defun spam-stat-load () "Read the `spam-stat' hash table from disk." ;; TODO: maybe we should warn the user if spam-stat-dirty is t? - (load-file spam-stat-file) - (setq spam-stat-dirty nil)) + (let ((coding-system-for-read spam-stat-coding-system)) + (cond (spam-stat-dirty (message "Spam stat not loaded: spam-stat-dirty t")) + ((or (not (boundp 'spam-stat-last-saved-at)) + (null spam-stat-last-saved-at) + (not (equal spam-stat-last-saved-at + (nth 5 (file-attributes spam-stat-file))))) + (progn + (load-file spam-stat-file) + (setq spam-stat-dirty nil + spam-stat-last-saved-at + (nth 5 (file-attributes spam-stat-file))))) + (t (message "Spam stat file not loaded: no change in disk."))))) (defun spam-stat-to-hash-table (entries) "Turn list ENTRIES into a hash table and store as `spam-stat'. @@ -433,46 +483,87 @@ The default score for unknown words is stored in These are the words whose spam-stat differs the most from 0.5. The list returned contains elements of the form \(WORD SCORE DIFF), where DIFF is the difference between SCORE and 0.5." - (with-spam-stat-max-buffer-size - (with-syntax-table spam-stat-syntax-table - (let (result word score) - (maphash (lambda (word ignore) - (setq score (spam-stat-score-word word) - result (cons (list word score (abs (- score 0.5))) - result))) - (spam-stat-buffer-words)) - (setq result (sort result (lambda (a b) (< (nth 2 b) (nth 2 a))))) - (setcdr (nthcdr 14 result) nil) - result)))) + (let (result word score) + (maphash (lambda (word ignore) + (setq score (spam-stat-score-word word) + result (cons (list word score (abs (- score 0.5))) + result))) + (spam-stat-buffer-words)) + (setq result (sort result (lambda (a b) (< (nth 2 b) (nth 2 a))))) + (setcdr (nthcdr 14 result) nil) + result)) + +(eval-when-compile + (defmacro spam-stat-called-interactively-p (kind) + (condition-case nil + (progn + (eval '(called-interactively-p 'any)) + ;; Emacs >=23.2 + `(called-interactively-p ,kind)) + ;; Emacs <23.2 + (wrong-number-of-arguments '(called-interactively-p)) + ;; XEmacs + (void-function '(interactive-p))))) (defun spam-stat-score-buffer () - "Return a score describing the spam-probability for this buffer." + "Return a score describing the spam-probability for this buffer. +Add user supplied modifications if supplied." + (interactive) ; helps in debugging. (setq spam-stat-score-data (spam-stat-buffer-words-with-scores)) - (let* ((probs (mapcar (lambda (e) (cadr e)) spam-stat-score-data)) - (prod (apply #'* probs))) - (/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x)) - probs)))))) + (let* ((probs (mapcar 'cadr spam-stat-score-data)) + (prod (apply #'* probs)) + (score0 + (/ prod (+ prod (apply #'* (mapcar #'(lambda (x) (- 1 x)) + probs))))) + (score1s + (condition-case + spam-stat-error-holder + (spam-stat-score-buffer-user score0) + (error nil))) + (ans + (if score1s (+ score0 score1s) score0))) + (when (spam-stat-called-interactively-p 'any) + (message "%S" ans)) + ans)) + +(defun spam-stat-score-buffer-user (&rest args) + (let* ((scores + (mapcar + (lambda (fn) + (apply fn args)) + spam-stat-score-buffer-user-functions))) + (if (memq nil scores) nil + (apply #'+ scores)))) (defun spam-stat-split-fancy () "Return the name of the spam group if the current mail is spam. Use this function on `nnmail-split-fancy'. If you are interested in the raw data used for the last run of `spam-stat-score-buffer', check the variable `spam-stat-score-data'." - (condition-case var + (condition-case spam-stat-error-holder (progn (set-buffer spam-stat-buffer) (goto-char (point-min)) - (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshhold) + (when (> (spam-stat-score-buffer) spam-stat-split-fancy-spam-threshold) (when (boundp 'nnmail-split-trace) (mapc (lambda (entry) (push entry nnmail-split-trace)) spam-stat-score-data)) spam-stat-split-fancy-spam-group)) - (error (message "Error in spam-stat-split-fancy: %S" var) + (error (message "Error in spam-stat-split-fancy: %S" spam-stat-error-holder) nil))) ;; Testing +(defun spam-stat-strip-xref () + "Strip the Xref header." + (save-restriction + (mail-narrow-to-head) + (when (re-search-forward "^Xref:.*\n" nil t) + (delete-region (match-beginning 0) (match-end 0))))) + +(autoload 'time-to-number-of-days "time-date") + (defun spam-stat-process-directory (dir func) "Process all the regular files in directory DIR using function FUNC." (let* ((files (directory-files dir t "^[^.]")) @@ -482,10 +573,13 @@ check the variable `spam-stat-score-data'." (dolist (f files) (when (and (file-readable-p f) (file-regular-p f) - (> (nth 7 (file-attributes f)) 0)) + (> (nth 7 (file-attributes f)) 0) + (< (time-to-number-of-days (time-since (nth 5 (file-attributes f)))) + spam-stat-process-directory-age)) (setq count (1+ count)) (message "Reading %s: %.2f%%" dir (/ count max)) - (insert-file-contents f) + (insert-file-contents-literally f) + (spam-stat-strip-xref) (funcall func) (erase-buffer)))))) @@ -529,7 +623,7 @@ display non-spam files; otherwise display spam files." (setq count (1+ count)) (message "Reading %.2f%%, score %.2f" (/ count max) (/ score count)) - (insert-file-contents f) + (insert-file-contents-literally f) (setq buffer-score (spam-stat-score-buffer)) (when (> buffer-score 0.9) (setq score (1+ score))) @@ -558,27 +652,27 @@ COUNT defaults to 5" (spam-stat-bad entry)) count) (remhash key spam-stat))) - spam-stat)) + spam-stat) + (setq spam-stat-dirty t)) (defun spam-stat-install-hooks-function () - "Install the spam-stat function hooks" + "Install the spam-stat function hooks." (interactive) (add-hook 'nnmail-prepare-incoming-message-hook 'spam-stat-store-current-buffer) (add-hook 'gnus-select-article-hook 'spam-stat-store-gnus-article-buffer)) -(when spam-stat-install-hooks - (spam-stat-install-hooks-function)) - (defun spam-stat-unload-hook () - "Uninstall the spam-stat function hooks" + "Uninstall the spam-stat function hooks." (interactive) (remove-hook 'nnmail-prepare-incoming-message-hook 'spam-stat-store-current-buffer) (remove-hook 'gnus-select-article-hook 'spam-stat-store-gnus-article-buffer)) +(add-hook 'spam-stat-unload-hook 'spam-stat-unload-hook) + (provide 'spam-stat) ;;; spam-stat.el ends here