1 ;;; bbdb-snarf.el -- convert free-form text to BBDB records
4 ;;; Copyright (C) 1997 by John Heidemann <johnh@isi.edu>.
5 ;;; $Id: bbdb-snarf.el,v 1.8 2007-02-23 20:24:08 fenk Exp $
7 ;;; This file is free software; you can redistribute it and/or modify
8 ;;; it under the terms of the GNU General Public License as published
9 ;;; by the Free Software Foundation version 1.
11 ;;; This file is distributed in the hope that it will be useful, but
12 ;;; WITHOUT ANY WARRANTY; without even the implied warranty of
13 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 ;;; General Public License for more details.
16 ;;; You should have received a copy of the GNU General Public License
17 ;;; along with GNU Emacs; see the file COPYING. If not, write to
18 ;;; the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
22 ;;; bbdb-snarf is code to pick addresses, phones, and such out of a
23 ;;; free-form paragraphs. Things are recognized by context (web pages
24 ;;; start with http:// or www., for example). I wrote it because I
25 ;;; despise fill-in-the-blank forms (a la bbdb-create). (if I wanted
26 ;;; modes, I'd use vi :-).
28 ;;; Eventually I'd like to be able to replace bbdb-mode with a free-form
29 ;;; text mode where bbdb-snarf merges in any changes you make.
30 ;;; I'm not there yet---merging is not good enough currently.
31 ;;; Currently bbdb-snarf is good for pulling postal addresses
32 ;;; from e-mail messages and converting other databases.
40 (defconst bbdb-digit "[0-9]")
41 (defvar bbdb-snarf-phone-regexp
43 "\\(([2-9][0-9][0-9])[-. ]?\\|[2-9][0-9][0-9][-. ]\\)?"
44 "[0-9][0-9][0-9][-. ][0-9][0-9][0-9][0-9]"
45 "\\( *\\(x\\|ext\\.?\\) *[0-9]+\\)?"
47 "regexp to match phones.")
48 (defvar bbdb-snarf-zip-regexp
51 bbdb-digit bbdb-digit bbdb-digit bbdb-digit bbdb-digit
52 "\\(-" bbdb-digit bbdb-digit bbdb-digit bbdb-digit "\\)?"
54 "regexp matching zip.")
56 (defcustom bbdb-snarf-web-prop 'www
57 "What property bbdb should use for the web, or nil to not detect web URLs."
61 (defun bbdb-snarf-address-lines ()
62 (let ((lines (bbdb-split (buffer-string) "\n")))
63 (if (>= bbdb-file-format 5) nil
64 (while (< (length lines) 3)
65 (setq lines (append lines (list nil))))
66 (if (> (length lines) 3)
67 (error "bbdb-snarf-address-lines: too many lines in address.")))
68 (delete-region (point-min) (point-max))
71 (defun bbdb-snarf-make-address
72 (label address-lines city state zip country)
73 (if (>= bbdb-file-format 4)
74 (vector label address-lines city state zip country)
75 (if (>= bbdb-file-format 3)
76 (vector label address-lines city state zip)
83 (defun bbdb-snarf-prune-empty-lines ()
84 (goto-char (point-min))
85 (while (re-search-forward "^[ \t]*\n" (point-max) t)
88 (defun delete-and-return-region (begin end)
90 (buffer-substring begin end)
91 (delete-region begin end)))
93 (defun bbdb-snarf-extract-label (default consume-p)
94 "Extract the label before the point, or return DEFAULT if no label.
95 If CONSUME-P is set, delete the text, if found."
96 (interactive "sDefault label: ")
97 (let ((end (point-marker)))
98 (skip-chars-backward " \t")
99 (if (not (= (point) (point-min)))
102 (let* ((label-end (point))
103 (label (delete-and-return-region
104 (progn (skip-chars-backward "^\n,;") (point))
106 (delete-region (point) end)
110 (defun bbdb-snarf-parse-phone-number (phone)
111 "Fix the bogosity that is `bbdb-snarf-parse-phone-number'.
112 It doesn't always return a normalized phone number.
113 For (800) 555-1212 it returns a three element list."
114 (let ((try (bbdb-parse-phone-number phone)))
115 (if (= 3 (length try))
120 (defun bbdb-snarf (where)
121 "snarf up a bbdb record WHERE the point is.
122 We assume things are line-broken and paragraph-bounded.
123 The name comes first and other fields (address,
124 phone, email, web pages) are recognized by context.
127 addresses end with \"City, State ZIP\" or \"City, State\"
128 phones match bbdb-snarf-phone-regexp
129 (currently US-style phones)
130 e-mail addresses have @'s in them
131 web sites are recognized by http:// or www.
133 Address and phone context are currently US-specific;
134 patches to internationalize these assumptions are welcome.
136 \\[bbdb-snarf] is similar to \\[bbdb-whois-sentinel], but less specialized."
139 (progn (goto-char where) (forward-paragraph -1) (point))
140 (progn (forward-paragraph 1) (point))))
143 (defun bbdb-snarf-region (begin end)
144 "snarf up a bbdb record in the current region. See `bbdb-snarf' for
150 ((buf (get-buffer-create " *BBDB snarf*"))
151 (text (buffer-substring-no-properties begin end))
152 phones nets web city state zip name address-lines
153 address-vector notes)
158 ;; toss beginning and trailing space
159 (goto-char (point-min))
160 (while (re-search-forward "^[ \t]+" (point-max) t)
162 (goto-char (point-min))
163 (while (re-search-forward "^\\s +$" (point-max) t)
166 ;; first, pick out phone numbers
167 (goto-char (point-min))
168 (while (re-search-forward bbdb-snarf-phone-regexp (point-max) t)
170 (begin (match-beginning 0))
174 (if (looking-at "[0-9A-Za-z]")
175 (goto-char end);; not really phone
176 (setq phone (bbdb-snarf-parse-phone-number
177 (delete-and-return-region begin end))
178 phones (append phones
180 (list (bbdb-snarf-extract-label
181 (bbdb-label-completion-default
186 (goto-char (point-min))
187 (if (and bbdb-snarf-web-prop
188 (re-search-forward "\\(http://\\|www\.\\)[^ \t\n]+"
191 (setq web (match-string 0)
192 notes (append notes (list (cons bbdb-snarf-web-prop web))))
196 (goto-char (point-min))
197 (while (re-search-forward "[^ \t\n<]+@[^ \t\n>]+" (point-max) t)
198 (setq nets (append nets (list (match-string 0))))
201 (bbdb-snarf-prune-empty-lines)
204 (goto-char (point-min))
205 ;; This check is horribly english-centric (I think)
206 (while (and (not (eobp)) (/= (char-syntax (char-after (point))) ?w))
208 (if (re-search-forward "\\(\\sw\\|[ -\.,]\\)*\\sw" nil t)
210 (setq name (match-string 0))
211 (delete-region (match-beginning 0) (match-end 0))))
214 (goto-char (point-min))
217 ((re-search-forward bbdb-snarf-zip-regexp (point-max) t)
221 (narrow-to-region (point-min) (match-end 0))
222 (goto-char (point-max))
224 (re-search-backward bbdb-snarf-zip-regexp (point-min) t)
225 (setq zip (bbdb-parse-zip-string (match-string 0)))
227 (skip-chars-backward " \t")
229 (skip-chars-backward "^ \t,")
230 (setq state (buffer-substring (point) mk))
232 (skip-chars-backward " \t,")
235 (setq city (buffer-substring (point) mk))
238 (delete-region (point) (point-max))
240 (goto-char (point-min))
241 (setq address-lines (bbdb-snarf-address-lines)
242 address-vector (list (bbdb-snarf-make-address
243 (bbdb-label-completion-default
249 "";; FIXME: snarf country
251 ;; try for just city, state
252 ((re-search-forward "^\\(.*\\), \\([A-Z][A-Za-z]\\)$"
256 (setq city (match-string 1)
257 state (match-string 2))
258 (narrow-to-region (point-min) (match-end 0))
259 (goto-char (point-min))
260 (setq address-lines (bbdb-snarf-address-lines)
261 address-vector (list (bbdb-snarf-make-address
267 "";; FIXME: snarf country
270 (setq address-lines '(nil nil nil)
271 address-vector nil)))
273 ;; anything else -> notes
274 (bbdb-snarf-prune-empty-lines)
275 (if (/= (point-min) (point-max))
276 (setq notes (append notes (list (cons 'notes (buffer-string))))))
279 ; (goto-char (point-max))
283 ; "state: " state "\n"
287 (and nets (car (car (bbdb-rfc822-addresses (car nets)))))
290 (bbdb-merge-interactively name
298 ; (setq bbdb-snarf-test-cases "
300 ; another test person
302 ; Los Angeles, CA 91342
305 ; http://www.foo.bar/
306 ; other stuff about this person
310 ; St. Los Angeles, CA 91342-1234
316 ; Los Angeles, California 91342-1234
330 (defun bbdb-merge-interactively (name company nets addrs phones notes)
331 "Interactively add a new record; arguments same as \\[bbdb-create-internal]."
333 ((f-l-name (bbdb-divide-name name))
334 (firstname (car f-l-name))
335 (lastname (nth 1 f-l-name))
338 (vector firstname lastname aka company phones addrs
339 (if (listp nets) nets (list nets)) notes
340 (make-vector bbdb-cache-length nil)))
341 (old-record (bbdb-search-simple name nets)))
344 (setq new-record (bbdb-merge-internally old-record new-record))
345 (bbdb-delete-record-internal old-record)))
347 (bbdb-invoke-hook 'bbdb-create-hook new-record)
348 (bbdb-change-record new-record t)
349 (bbdb-hash-record new-record)
350 (bbdb-display-records (list new-record))))
352 (defun bbdb-merge-internally (old-record new-record)
353 "Merge two records. NEW-RECORDS wins over OLD in cases of ties."
354 (if (and (null (bbdb-record-firstname new-record))
355 (bbdb-record-firstname old-record))
356 (bbdb-record-set-firstname new-record (bbdb-record-firstname old-record)))
357 (if (and (null (bbdb-record-lastname new-record))
358 (bbdb-record-lastname old-record))
359 (bbdb-record-set-lastname new-record (bbdb-record-lastname old-record)))
360 (if (and (null (bbdb-record-company new-record))
361 (bbdb-record-company old-record))
362 (bbdb-record-set-company new-record (bbdb-record-company old-record)))
364 (let ((old-nets (bbdb-record-net old-record))
365 (new-nets (bbdb-record-net new-record)))
367 (if (not (member (car old-nets) new-nets))
368 (setq new-nets (append new-nets (list (car old-nets)))))
369 (setq old-nets (cdr old-nets)))
370 (bbdb-record-set-net new-record new-nets))
372 (let ((old-addresses (bbdb-record-addresses old-record))
373 (new-addresses (bbdb-record-addresses new-record)))
375 (if (not (member (car old-addresses) new-addresses))
376 (setq new-addresses (append new-addresses (list (car old-addresses)))))
377 (setq old-addresses (cdr old-addresses)))
378 (bbdb-record-set-addresses new-record new-addresses))
380 (let ((old-phones (bbdb-record-phones old-record))
381 (new-phones (bbdb-record-phones new-record)))
383 (if (not (member (car old-phones) new-phones))
384 (setq new-phones (append new-phones (list (car old-phones)))))
385 (setq old-phones (cdr old-phones)))
386 (bbdb-record-set-phones new-record new-phones))
388 (let ((old-notes (bbdb-record-raw-notes old-record))
389 (new-notes (bbdb-record-raw-notes new-record)))
391 (if (not (member (car old-notes) new-notes))
392 (setq new-notes (append new-notes (list (car old-notes)))))
393 (setq old-notes (cdr old-notes)))
394 (bbdb-record-set-raw-notes new-record new-notes))
398 ;;----------------------------------------------------------------------------
400 (if (fboundp 'replace-in-string)
401 (fset 'bbdb-replace-in-string 'replace-in-string)
402 (if (fboundp 'replace-regexp-in-string) ; defined in e21
403 (fset 'bbdb-replace-regexp-in-string 'replace-regexp-in-string)
404 ;; actually this is `dired-replace-in-string' slightly modified
405 ;; We're not defining the whole thing, just enough for our purposes.
406 (defun bbdb-replace-regexp-in-string (regexp newtext string &optional
408 ;; Replace REGEXP with NEWTEXT everywhere in STRING and return result.
409 ;; NEWTEXT is taken literally---no \\DIGIT escapes will be recognized.
410 (let ((result "") (start 0) mb me)
411 (while (string-match regexp string start)
412 (setq mb (match-beginning 0)
414 result (concat result (substring string start mb) newtext)
416 (concat result (substring string start)))))
417 (defun bbdb-replace-in-string (string regexp newtext &optional literal)
418 (bbdb-replace-regexp-in-string regexp newtext string nil literal))))
420 (defcustom bbdb-extract-address-component-regexps
422 ;; "surname, firstname" <address> from Outlookers
423 ("\"\\([^\"]*\\)\"\\s-*<\\([^>]+\\)>"
424 (bbdb-clean-username (match-string 1 adstring)) 2)
427 ("\\([^<>,\t][^<>,]+[^<>, \t]\\)\\s-*<\\([^>]+\\)>"
430 ("<\\([^>,]+\\)>" nil 1)
432 ("\\(\\b[^<\",()]+\\b\\)\\s-*(\\([^)]+\\))"
433 (car (mail-extract-address-components
434 (concat "\"" (match-string 2 adstring) "\"")))
436 ;; firstname.lastname@host
437 ("\\b\\(\\([^@ \t\n.]+\\.[^@ \t\n.]+\\)@[^@ \t\n]+\\)\\b"
438 (car (mail-extract-address-components
439 (concat "\"" (match-string 2 adstring) "\"")))
442 ("\\b\\(\\([^@ \t\n]+\\)@[^@ \t\n]+\\)\\b"
445 ("\\b\\([^@ \t\n]+\\)\\b"
448 "*List of regexps matching headers.
449 Each list element should have the form (REGEXP FULLNAME ADDRESS), where
450 REGEXP matches the address while the actual address components should
451 be a parenthesized expression.
453 FULLNAME is a default string for addresses without full name or a
454 number denoting parenthesized expression.
455 ADDRESS is a number denoting the parenthesized expression matching the
458 If FULLNAME or ADDRESS is a list it will be evaluated to return a
459 string or nil. If its a function it will be called with the remaining
460 address-string as argument."
461 :group 'bbdb-noticing-records
464 (defcustom bbdb-extract-address-component-ignore-regexp
465 "\\(\\(undisclosed\\|unlisted\\)[^,]*recipients\\)\\|no To-header on input"
466 "*A regexp matching addresses which should be ignored."
467 :group 'bbdb-noticing-records
470 (defcustom bbdb-extract-address-component-handler 'message
471 "*Specifies how `bbdb-extract-address-components' reports errors.
473 A value of nil means ignore unparsable stuff and 'warn will report
474 a warning, 'message will report a message in the minibuffer and all
475 other value will fire a error.
477 When set to a function it will be called with the remaining string in
478 order to extract the address components and return the rest and the
479 components as list or to do what ever it wants, e.g. send a complain
482 To skip known unparseable stuff you rather should set the variable
483 `bbdb-extract-address-component-ignore-regexp' instead of disabling
485 :group 'bbdb-noticing-records
486 :type '(choice (const :tag "Ignore problems."
488 (const :tag "Warn about parsing problems."
490 (const :tag "Show a message about parsing problems."
492 (function :tag "A user defined handler")))
495 (defun bbdb-extract-address-components (adstring &optional ignore-errors)
496 "Return a list of address components found in ADSTRING.
497 If extracting fails one probably has to adjust the variable
498 `bbdb-extract-address-component-regexps'."
499 (let ((case-fold-search t)
504 ;; Do some string cleanup and trimming
505 (setq adstring (bbdb-replace-in-string adstring "[\n\t]" " "))
506 (setq adstring (bbdb-replace-in-string adstring " " " "))
507 (setq adstring (bbdb-replace-in-string adstring "^ +" ""))
510 (while (not (string= "" adstring))
511 (setq adcom-regexp bbdb-extract-address-component-regexps
514 (let ((regexp (caar adcom-regexp))
515 (fn (car (cdar adcom-regexp)))
516 (ad (cadr (cdar adcom-regexp))))
519 bbdb-extract-address-component-ignore-regexp
520 "\\)[^,]*\\(,\\|$\\)")
522 (setq adstring (substring adstring (match-end 0))
525 ((string-match (concat "^\\s-*" regexp "\\s-*\\(,\\|$\\)")
527 (add-to-list 'fnadlist
530 (match-string fn adstring))
532 (save-match-data (eval fn)))
535 (funcall fn adstring)))
542 (match-string ad adstring))
544 (save-match-data (eval ad)))
547 (funcall ad adstring)))
553 ; (message "%S Match on %S to\n\t%S"
554 ; regexp adstring fnadlist))
555 (setq adstring (substring adstring (match-end 0))
558 (setq adcom-regexp (cdr adcom-regexp))))
560 ;; Now handle problems
561 (if (and nomatch (not ignore-errors))
562 (cond ((equal bbdb-extract-address-component-handler nil))
563 ((equal bbdb-extract-address-component-handler 'warn)
564 (bbdb-warn "Cannot extract an address component at \"%s\".
565 See `bbdb-extract-address-component-handler' for more information."
567 ((equal bbdb-extract-address-component-handler 'message)
568 (message "Cannot extract an address component at \"%s\"."
570 ((functionp bbdb-extract-address-component-handler)
572 (funcall bbdb-extract-address-component-handler
574 (if (and (listp result) (= 3 (length result)))
575 (progn (add-to-list 'fnadlist (cdr result))
576 (setq adstring (car result)
579 (error "Cannot extract an address component at \"%30s\""
582 ;; ignore the bad junk
584 (if (string-match "^[^,]*," adstring)
585 (setq adstring (substring adstring (match-end 0)))
586 (setq adstring ""))))
588 (delete '(nil nil) (nreverse fnadlist))))
590 ;;; alternative name parser
592 (defun bbdb-rfc822-addresses ( addrline &optional ignore-errors)
593 "Split ADDRLINE into a list of parsed addresses.
595 You can't do this with rfc822.el in any sort of useful way because it discards
596 the comments. You can't do this with mail-extr.el because the multiple address
597 parsing in GNU Emacs appears to be broken beyond belief, and the XEmacs
598 version doesn't support multiple addresses."
599 (let (addrs (start 0))
600 (setq addrline (concat addrline ",")) ;; kludge, to make parsing easier
601 ;; Addresses are separated by commas. This is probably the worst
602 ;; possible way to do this, but it does cut down on the amount of
603 ;; coding effort I have to duplicate. Basically, we split on
604 ;; commas, and then try and parse what we've found. Pathologically
605 ;; bad address lines will break this.
606 (while (string-match "\\([^,]+\\)," addrline start)
607 (let* ((thisaddr (substring addrline 0 (match-end 1)))
608 (comma (match-end 0)) ;; rfc822-addresses trashes match-data
609 (parsed (rfc822-addresses thisaddr)))
610 (if (string-match "(" (or (car parsed) "")) ;; rfc822 didn't like it.
614 (mail-extract-address-components
616 ;; throw away what we just parsed
617 addrline (substring addrline comma)
621 (provide 'bbdb-snarf)