1 ;;; edict-morphology.el --- morphology rewrite engine for edict.el
3 ;; Copyright (C) 1992 Bob Kerns <rwk@crl.dec.com>
4 ;; Copyright (C) 1991, 1992 Per Hammarlund (perham@nada.kth.se)
5 ;; Copyright (C) 1998, 2002 Free Software Foundation, Inc.
7 ;; Author: Per Hammarlund <perham@nada.kth.se>
8 ;; Keywords: mule, edict, dictionary
10 ;; Adapted-by: Stephen J. Turnbull <stephen@xemacs.org> for XEmacs
11 ;; Maintainer: Stephen J. Turnbull <stephen@xemacs.org>
13 ;; This file is part of XEmacs.
15 ;; XEmacs is free software; you can redistribute it and/or modify it
16 ;; under the terms of the GNU General Public License as published by
17 ;; the Free Software Foundation; either version 2, or (at your
18 ;; option) any later version.
20 ;; XEmacs is distributed in the hope that it will be useful, but
21 ;; WITHOUT ANY WARRANTY; without even the implied warranty of
22 ;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
23 ;; General Public License for more details.
25 ;; You should have received a copy of the GNU General Public License
26 ;; along with XEmacs; if not, write to the Free Software Foundation,
27 ;; Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
31 ;; Some code that looks for translations of english and japanese using the
32 ;; EDICTJ Public Domain japanese/english dictionary.
34 ;; Written by Per Hammarlund <perham@nada.kth.se>
35 ;; Morphology and private dictionary handling/editing by Bob Kerns
37 ;; Helpful remarks from Ken-Ichi Handa <handa@etl.go.jp>.
38 ;; The EDICTJ PD dictionary is maintained by Jim Breen
39 ;; <jwb@monu6.cc.monash.edu.au>
41 ;; Morphology rewrite engine
47 ;; 1998-03-27 Stephen Turnbull <stephen@xemacs.org>
48 ;; (created): broken out from monolithic edict.el
52 (require 'cl) ; for defstruct
54 ;; Set this to true for debugging.
55 (defvar *edict-expand-string-trace* nil)
59 ;; The edict-category-* variables are used to emulate the character
60 ;; categories for regexps that are (partially) documented, but not
61 ;; implemented, in XEmacs/Mule (20.5). It should be possible to use
62 ;; ranges for this, in the sense that defined legal characters in a
63 ;; given range are guaranteed to be of the appropriate category. The
64 ;; reason is that each Mule-defined character set will occupy such a
65 ;; range by virtue of the leading-byte implementation (see `Info |
66 ;; Internals | MULE Character Sets and Encodings | Internal Mule
67 ;; Encodings | Internal Character Encoding' for the easily understood
68 ;; character type representation; the Bufbyte representation is a
69 ;; simple transformation format of varying width). Then JIS, at
70 ;; least, carefully arranges the categories of characters into
71 ;; non-overlapping ranges (ranges > 96 code points (94 for JIS) are
72 ;; necessarily non-contiguous, see the info section cited above---this
73 ;; is why undefined or illegal characters cannot be ruled out). Mule
74 ;; itself is happy to insert undefined characters---try eval'ing
75 ;; (insert (int-char (+ (char-int ?
\e$Bt$
\e(B) 3)))---but balks at illegal
76 ;; ones---(insert (int-char (- (char-int ?
\e$B0!
\e(B) 1))). However, there
77 ;; are two holes in JIS X 0208 (between the yomi-ordered Level 1 kanji
78 ;; and the radical-ordered Level 2, and at the end of the character
79 ;; set) and these undefined characters can be inserted, eg by the LISP
82 ;; Regexp ranges cannot be depended on; they work for all JIS-derived
83 ;; encodings (including EUC, ISO-2022-JP, and SJIS), but won't for
84 ;; UCS/Unicode. Watch out for (eg) Big-5 in trying to generalize this
87 ;; The values for the ranges are taken from Ken Lunde, Understanding
88 ;; Japanese Information Processing, (C) 1993 O'Reilly & Associates,
89 ;; Sebastopol, CA, and from
\e$B6S8+!&9b66!&8MB<!&H>ED!&7,M}!&8~@n!&5HED!"
\e(B
90 ;;
\e$B%^%k%A%j%s%,%k4D6-$N<B8=!"%W%l%s%F%#%9%[!<%k=PHG
\e(B.
93 (defconst edict-category-c "[
\e$B0!
\e(B-
\e$Bt&
\e(B]"
94 "JIS X 0208-1990 kanji character category.
95 NB: The last two characters of the range will be invisible for most fonts
96 (based on JIS X 0208-1983).")
98 (defconst edict-category-C "[
\e$(D0!
\e(B-
\e$(Dmc
\e(B]"
99 "JIS X 0212-1990 kanji character category.
100 Computed as kuten 1601 - kuten 7767, leading byte 0x94, but not verified
101 (I don't have the fonts on hand.) #### Also, this may not be the official
102 definition of character category \cC in FSF Emacs 20.
103 Cannot be combined with JIS X 0208-1983/1990 (leading byte 0x92) since
104 Korean KSC 5601-1987 is interpolated (leading byte 0x93).")
106 (defconst edict-category-h "[
\e$B$!
\e(B-
\e$B$s
\e(B]"
107 "JIS X 0208-1990 hiragana character category.")
109 (defconst edict-category-H "[
\e$B~~
\e(B]"
110 "Unknown (half-width hiragana character?) category.
111 Currently implemented as the undefined character kuten 9494 in JIS X 0208.")
113 (defconst edict-category-k "[
\e$B%!
\e(B-
\e$B%t
\e(B]"
114 "JIS X 0208-1990 katakana character category.
115 Does not include the `small ka' and `small ke' characters (which would not
116 be appropriate in yomi).")
118 (defconst edict-category-K "[
\e(I&
\e(B-
\e(I_
\e(B]"
119 "JIS X 0201-1976 \"half-width\" katakana character category.")
121 (defconst edict-category-kana
122 (concat edict-category-h "\\|" edict-category-H
123 "\\|" edict-category-k "\\|" edict-category-K)
124 "Japanese kana (half- or full-width, kata- or hira-) character category.")
126 (defconst edict-category-Japanese-word-constituent
127 (concat edict-category-kana "\\|" edict-category-c "\\|" edict-category-C)
128 "Japanese word constituent character category.")
130 (defconst edict-dictionary-entry-start-regexp
131 (concat "^\\(\\(" edict-category-Japanese-word-constituent "\\)+\\)[ \t]")
132 "Matches start of an EDICT entry.")
134 (defconst edict-yomi-regexp
135 (concat "^\\(" edict-category-kana "\\)+$")
136 "Matches a pronunciation from an EDICT entry.")
138 (defconst edict-yomi-part-regexp
139 (concat "\\[^\\(" edict-category-kana "\\)+$")
140 "Matches a `yomi part' (includes delimiters) from an EDICT entry.")
143 ;; The syntax of the rules is:
144 ;; (define-edict-rule name <pattern> <conversion-function> <conversion-data>).
146 ;; <pattern> is a regular expression, with the parts to be substituted
147 ;; being denoted by \\(<subpattern>\\).
149 ;; <conversion function> is a function responsible for determining
150 ;; the replacements. The current choices are
151 ;; edict-subst-modified-affix and edict-subst-affix. These
152 ;; functions are called just after doing match-string, so the regexp
153 ;; variables are set up. They are applied to the string, and
154 ;; <conversion-data>. These functions are responsible for
155 ;; determining and performing the substitutions to be made, and
156 ;; returning a list of possiblities.
158 ;; edict-subst-affix is the simpler case. It takes as conversion
159 ;; data one string for each subpattern in the pattern. This string
160 ;; will be used in place of the original.
162 ;; edict-subst-modified-affix takes as conversion data, an
163 ;; alternating list of functions and lists of additional arguments
164 ;; for those functions. Each function is applied to the substring
165 ;; being replaced and its additional arguments. Likely functions to
166 ;; use include edict-modify-verb, edict-ignore, and edict-subst.
168 ;; Table of morphological rules.
169 (defvar *edict-syntax-types* nil)
171 ;; defstruct's defsetfs should expand into this; sigh.
172 ;; Maybe this is fixed, comment them out.
173 ;(eval-when (eval load compile)
174 (defstruct edict-syntax-type
179 (defun get-edict-syntax-type (name)
182 (dolist (s *edict-syntax-types*)
183 (when (eq (edict-syntax-type-name s) name)
184 (throw 'found-it s)))
185 (let ((new (make-edict-syntax-type :name name :rules ())))
186 (push new *edict-syntax-types*)
190 ;(eval-when (eval load compile)
191 (defstruct edict-rule
193 pattern ;Pattern which it must match
194 filter ;Syntactic filter on previous form
195 function ;Function to transform the input
196 additional-args ;Arguments to transform function
197 from-syntax-types ;Syntaxes for which this is valid
198 to-syntax-types) ;Syntaxes to consider after this rule.
201 ;; Delete all occurrances of a rule from the rule base.
202 (defun edict-delete-rule (name)
203 (dolist (s *edict-syntax-types*)
204 (let ((old (edict-get-rule-from-syntax-type name s)))
206 (setf (edict-syntax-type-rules s)
207 (delq old (edict-syntax-type-rules s)))))))
209 ;(defun edict-decircularize-rules ()
211 ; (dolist (s *edict-syntax-types*)
212 ; (dolist (r (edict-syntax-type-rules s))
213 ; (setf (edict-rule-from-syntax-types r)
214 ; (mapcar (function (lambda (type)
217 ; (edict-syntax-type-name type))))
218 ; (edict-rule-from-syntax-types r)))
219 ; (setf (edict-rule-to-syntax-types r)
220 ; (mapcar (function (lambda (type)
223 ; (edict-syntax-type-name type))))
224 ; (edict-rule-to-syntax-types r))))))
226 ;(defun edict-circularize-rules ()
228 ; (dolist (s *edict-syntax-types*)
229 ; (dolist (r (edict-syntax-type-rules s))
230 ; (setf (edict-rule-from-syntax-types r)
231 ; (mapcar (function (lambda (type)
233 ; (get-edict-syntax-type type)
235 ; (edict-rule-from-syntax-types r)))
236 ; (setf (edict-rule-to-syntax-types r)
237 ; (mapcar (function (lambda (type)
239 ; (get-edict-syntax-type type)
241 ; (edict-rule-to-syntax-types r))))))
243 (defun edict-add-rule (name rule)
244 (edict-delete-rule name)
245 (dolist (s (edict-rule-from-syntax-types rule))
246 (push rule (edict-syntax-type-rules s))))
248 (defun edict-get-rule-from-syntax-type (name syntax-type)
249 (catch 'edict-get-rule
250 (dolist (rule (edict-syntax-type-rules syntax-type))
251 (if (eq name (edict-rule-name rule))
252 (throw 'edict-get-rule rule)))))
254 (defmacro define-edict-rule (name pattern fromto function &rest additional-args)
255 ;; First, some type-checking.
259 ;; SJT: this needs to be a string. If it already is, that's ok.
260 (pattern (eval pattern)))
261 (when (stringp fromto)
265 (setq fromto '(
\e$BF|K\8l
\e(B
\e$BF|K\8l
\e(B)))
266 (setq from (first fromto)
269 (setq from (list from)))
272 (unless (string-match "^\\^\\|\\$$" pattern)
273 (error "Rule %s: pattern must start with ^ or end with $: \"%s\""
276 (unless (stringp filter)
277 (error "Rule %s: filter must be a regexp"
279 (` (define-edict-rule-internal '(, name) '(, pattern) '(, filter)
281 (function (, function)) (quote ((,@ additional-args)))))))
283 (defun define-edict-rule-internal (name pattern filter
284 from-syntax-types to-syntax-types
285 function additional-args)
286 (unless (string-match "^\\^\\|\\$$" pattern)
287 (error "Rule %s: pattern must start with ^ or end with $: \"%s\""
290 (unless (stringp filter)
291 (error "Rule %s: filter must be a regexp"
293 (let ((from-types nil)
295 (dolist (f from-syntax-types)
296 (push (get-edict-syntax-type f) from-types))
297 (dolist (to to-syntax-types)
298 (push (get-edict-syntax-type to) to-types))
300 (make-edict-rule :name name
303 :from-syntax-types from-types
304 :to-syntax-types to-types
306 :additional-args additional-args))
309 ;; #### This is bogus; the function does not match what the
310 ;; #### description above the rules says it satisfies. In particular,
311 ;; #### it is supposed to take only strings as arguments.
312 (defun edict-subst-affix (string &rest affixes)
317 (let ((pos (match-beginning i)))
318 ;; #### aren't the behaviors of edict-identity and
319 ;; edict-ignore reversed?
320 ;; #### Hmm ... maybe this code never triggers.
321 (cond ((eq x 'edict-identity))
322 ((eq x 'edict-ignore)
323 (setq result (concat result
326 (match-beginning i)))
328 ((and (symbolp x) (fboundp x))
334 (funcall x (substring string
338 (error "%s is not a string or function name in edict-subst-affix"
340 ((and pos (>= pos prev))
341 (setq result (concat result
346 (setq prev (match-end i))))
348 (concat result (substring string (max prev 0)))))
350 ;; Takes a series of alternating pairs of substitution functions
351 ;; and arguments for those substitution functions. This can be
352 ;; used to algorithmically replace certain parts (typically involving
353 ;; changing an
\e$B$$9T
\e(B to
\e$B$&9T
\e(B final character.
355 (defun edict-subst-modified-affix (string &rest affixes)
362 (setq fun (car affixes)
363 args (car (cdr affixes))
364 affixes (cdr (cdr affixes)))
365 (let ((pos (match-beginning i)))
366 ;; #### aren't the behaviors of edict-identity and
367 ;; edict-ignore reversed?
368 ;; #### Hmm ... maybe this code never triggers.
369 (cond ((eq fun 'edict-identity))
370 ((eq fun 'edict-ignore)
371 (setq result (concat result
374 (match-beginning i)))
376 ((not (or (stringp fun) (and (symbolp fun) (fboundp fun))))
377 (error "%s is not a string or function name in %s"
379 'edict-subst-modified-affix))
380 ((and pos (>= pos prev))
381 (setq result (concat result
382 (substring string (max prev 0) pos)
383 (apply fun (substring string
387 (setq prev (max prev (match-end i)))))
389 (concat result (substring string (max prev 0)))))
392 (defun edict-ignore (affix) "")
395 (defun edict-identity (affix) affix)
397 ;; Substitute for this piece
398 (defun edict-subst (affix data)
401 ;; More or less a guon table, for converting doshi suffixes.
402 (defvar *edict-doshi-suffix*
403 '(["
\e$B$o
\e(B" "
\e$B$$
\e(B" "
\e$B$&
\e(B" "
\e$B$(
\e(B" "
\e$B$*
\e(B"];; u -> wa; kau->kawanai
404 ["
\e$B$+
\e(B" "
\e$B$-
\e(B" "
\e$B$/
\e(B" "
\e$B$1
\e(B" "
\e$B$3
\e(B"]
405 ["
\e$B$,
\e(B" "
\e$B$.
\e(B" "
\e$B$0
\e(B" "
\e$B$2
\e(B" "
\e$B$4
\e(B"]
406 ["
\e$B$5
\e(B" "
\e$B$7
\e(B" "
\e$B$9
\e(B" "
\e$B$;
\e(B" "
\e$B$=
\e(B"]
407 ["
\e$B$6
\e(B" "
\e$B$8
\e(B" "
\e$B$:
\e(B" "
\e$B$<
\e(B" "
\e$B$>
\e(B"]
408 ["
\e$B$?
\e(B" "
\e$B$A
\e(B" "
\e$B$D
\e(B" "
\e$B$F
\e(B" "
\e$B$H
\e(B"]
409 ["
\e$B$@
\e(B" "
\e$B$B
\e(B" "
\e$B$E
\e(B" "
\e$B$G
\e(B" "
\e$B$I
\e(B"]
410 ["
\e$B$J
\e(B" "
\e$B$K
\e(B" "
\e$B$L
\e(B" "
\e$B$M
\e(B" "
\e$B$N
\e(B"]
411 ["
\e$B$O
\e(B" "
\e$B$R
\e(B" "
\e$B$U
\e(B" "
\e$B$X
\e(B" "
\e$B$[
\e(B"]
412 ["
\e$B$P
\e(B" "
\e$B$S
\e(B" "
\e$B$V
\e(B" "
\e$B$Y
\e(B" "
\e$B$\
\e(B"]
413 ["
\e$B$Q
\e(B" "
\e$B$T
\e(B" "
\e$B$W
\e(B" "
\e$B$Z
\e(B" "
\e$B$]
\e(B"]
414 ["
\e$B$^
\e(B" "
\e$B$_
\e(B" "
\e$B$`
\e(B" "
\e$B$a
\e(B" "
\e$B$b
\e(B"]
415 ["
\e$B$i
\e(B" "
\e$B$j
\e(B" "
\e$B$k
\e(B" "
\e$B$l
\e(B" "
\e$B$m
\e(B"]))
417 (defun edict-modify-verb (suffix from to)
419 (dolist (b *edict-doshi-suffix*)
420 (if (equal suffix (aref b from))
421 (throw 'exit (aref b to))))
422 (throw 'skip-rule nil)))
424 ;; This returns a list of the results of applying all rules whose
425 ;; patterns match, to all levels of recursion.
426 (defun edict-expand-string (string &optional others previous syntax)
428 (syntax (or syntax '
\e$BF|K\8l
\e(B))
429 (stype (get-edict-syntax-type syntax)))
430 (dolist (rule (edict-syntax-type-rules stype))
431 (when (string-match (edict-rule-pattern rule) string)
433 (unless (and previous
434 (edict-rule-filter rule)
435 (edict-filter-rule rule previous))
436 (let ((temp (apply (edict-rule-function rule) string
437 (edict-rule-additional-args rule))))
438 (unless (or (equal temp string)
440 (member temp result))
441 (when *edict-expand-string-trace*
442 (read-string (format "%s: %s -> %s -:"
443 (edict-rule-name rule)
446 (union (edict-expand-string-recurse
447 temp (cons string (append result others))
450 (if (member string result)
452 (cons string result))))
454 (defun edict-expand-string-recurse (string others previous rule)
455 (edict-expand-string-syntaxes string others previous
456 (edict-rule-to-syntax-types rule)))
458 (defun edict-expand-string-syntaxes (string others previous syntaxes)
460 (dolist (syntax syntaxes)
462 (union (edict-expand-string string
463 (append result others)
470 ;; Returns T if the rule should not be run, because of the past
471 ;; history of expansions. I.e. if something started out with
\e$B$/
\e(Bon
472 ;; the end, and we've made it into an adjective, we should disable
473 ;; any expansions based on it being a the conjunctive/stem form of a
474 ;; verb. This is done purely based on the most immediately preceding
475 ;; expansion, because that is what determined the sense of the word.
477 (defun edict-filter-rule (rule previous)
478 (let ((filter (edict-rule-filter rule)))
479 (cond ((null filter) nil)
480 ((null previous) nil)
482 (string-match filter previous))
483 ;; #### This code is not functional yet, let those cases signal errors.
485 ; (funcall filter frob))
487 ; (apply (car filter) frob (cdr filter)))
488 (t (error "Bogus filter in rule %s: %s"
489 (edict-rule-name rule)
492 ;(defun edict-find (elt list)
494 ; (dolist (test list)
495 ; (when (equal elt test)
496 ; (throw 'edict-find test)))))
498 ;(defun edict-union (set1 set2)
499 ; (let ((result set2))
500 ; (dolist (frob set1)
501 ; (unless (member frob set2)
502 ; (setq result (cons frob result))))
505 (provide 'edict-morphology)
507 ;;; edict-morphology.el ends here