1 ;;; stem.el ---- routines for stemming
2 ;;; $Id: stem-english.el,v 1.1.1.1 1999/04/03 03:02:56 satoru-lookup Exp $
4 ;;; Author: Tsuchiya Masatoshi <tsuchiya@pine.kuee.kyoto-u.ac.jp>
9 ;;
\e$BO@J8!X
\e(BAn algorithm for suffix stripping (M.F.Porter)
\e$B!Y$K5-=R$5$l$F
\e(B
10 ;;
\e$B$$$k%"%k%4%j%:%`$K4p$E$$$F!"1QC18l$N8lHx$r<h$j=|$/$?$a$N%i%$%V%i%j!#
\e(B
11 ;;
\e$BMxMQ5Z$S:FG[I[$N:]$O!"
\e(BGNU
\e$B0lHL8xMQ5vBz=q$NE,Ev$J%P!<%8%g%s$K$7$?$,$C
\e(B
15 ;; http://www-nagao.kuee.kyoto-u.ac.jp/member/tsuchiya/elisp/xdic.html
22 (defvar stem:minimum-word-length 4 "Porter
\e$B$N%"%k%4%j%:%`$,E,MQ$G$-$k:G>.8lD9
\e(B")
25 ;;;============================================================
26 ;;;
\e$BHs8x3+4X?t
\e(B
27 ;;;============================================================
29 ;;
\e$BF0:nB.EY$r8~>e$5$;$k$?$a$K!"4X?tFbIt$G30ItJQ?t$r$$$8$C$F$$$k
\e(B
30 ;;
\e$B4X?t$,$"$j!"M=4|$7$J$$I{:nMQ$,H/@8$9$k2DG=@-$,9b$$!#=>$C$F!"
\e(B
31 ;;
\e$BHs8x3+4X?t$rD>@\8F$S=P$9$3$H$OHr$1$k$3$H!#
\e(B
33 ;;------------------------------------------------------------
34 ;; stemming-rule
\e$B$N>r7o@a$r5-=R$9$k4X?t72
\e(B
35 ;;------------------------------------------------------------
37 (defsubst stem:match (arg) "\
38 \e$BJQ?t
\e(B str
\e$B$r8!::$9$kHs8x3+4X?t
\e(B (
\e$B8l44$NItJ,$rJQ?t
\e(B stem
\e$B$KBeF~$9$k
\e(B)"
40 (string-match arg str)
41 (setq stem (substring str 0 (match-beginning 0)))))
43 (defsubst stem:m () "\
44 \e$BJQ?t
\e(B stem
\e$B$K4^$^$l$F$$$k
\e(B VC
\e$B$N?t$r5a$a$kHs8x3+4X?t
\e(B"
47 (while (string-match "\\(a\\|e\\|i\\|o\\|u\\|[^aeiou]y+\\)[aeiou]*" stem pos)
49 (setq pos (match-end 0)))
50 (if (= pos (length stem)) (1- m) m))))
52 (defsubst stem:m> (i) "\
53 \e$BJQ?t
\e(B stem
\e$B$K4^$^$l$F$$$k
\e(B VC
\e$B$N?t$N>r7o$r5-=R$9$kHs8x3+4X?t
\e(B"
56 (defsubst stem:m= (i) "\
57 \e$BJQ?t
\e(B stem
\e$B$K4^$^$l$F$$$k
\e(B VC
\e$B$N?t$N>r7o$r5-=R$9$kHs8x3+4X?t
\e(B"
60 (defsubst stem:*v* () "\
61 \e$BJQ?t
\e(B stem
\e$B$,Jl2;$r4^$s$G$$$k$+8!::$9$k4X?t
\e(B"
63 (if (string-match "\\(a\\|e\\|i\\|o\\|u\\|[^aeiou]y\\)" stem) t)))
65 (defsubst stem:*o () "\
66 \e$BJQ?t
\e(B stem
\e$B$,
\e(B cvc
\e$B$N7A$G=*$C$F$$$k$+8!::$9$k4X?t
\e(B"
68 (if (string-match "[^aeiou][aeiouy][^aeiouwxy]$" stem) t)))
72 ;;------------------------------------------------------------
73 ;; stemming-rule
\e$B$r5-=R$7$?4X?t72
\e(B
74 ;;------------------------------------------------------------
76 (defun stem:step1a (str) "
\e$BBh
\e(B1a
\e$BCJ3,$N
\e(B stemming rule (
\e$BHs8x3+4X?t
\e(B)"
79 ((stem:match "sses$") "ss")
80 ((stem:match "ies$") "i")
81 ((stem:match "ss$") "ss")
82 ((stem:match "s$") "")))
87 (defun stem:step1b (str) "
\e$BBh
\e(B1b
\e$BCJ3,$N
\e(B stemming rule (
\e$BHs8x3+4X?t
\e(B)"
90 ((and (stem:match "eed$") (stem:m> 0))
92 ((or (and (not stem) (stem:match "ed$") (stem:*v*))
93 (and (stem:match "ing$") (stem:*v*)))
94 (if (and (stem:m= 1) (stem:*o))
98 ((stem:match "at$") "ate")
99 ((stem:match "bl$") "ble")
100 ((stem:match "iz$") "ize")
101 ((stem:match "\\([^lsz]\\)\\1$")
102 (substring str (match-beginning 1) (match-end 1)))))
108 (defun stem:step1c (str) "
\e$BBh
\e(B1c
\e$BCJ3,$N
\e(B stemming rule (
\e$BHs8x3+4X?t
\e(B)"
110 (if (and (stem:match "y$")
116 (defun stem:step1 (str) "
\e$BBh
\e(B1
\e$BCJ3,$N
\e(B stemming rule (
\e$BHs8x3+4X?t
\e(B)"
122 (defun stem:step2 (str) "
\e$BBh
\e(B2
\e$BCJ3,$N
\e(B stemming rule (
\e$BHs8x3+4X?t
\e(B)"
126 ((stem:match "ational$") "ate")
127 ((stem:match "tional$") "tion")
128 ((stem:match "enci$") "ence")
129 ((stem:match "anci$") "ance")
130 ((stem:match "izer$") "ize")
131 ((stem:match "abli$") "able")
132 ((stem:match "alli$") "al")
133 ((stem:match "entli$") "ent")
134 ((stem:match "eli$") "e")
135 ((stem:match "ousli$") "ous")
136 ((stem:match "ization$") "ize")
137 ((stem:match "ation$") "ate")
138 ((stem:match "ator$") "ate")
139 ((stem:match "alism$") "al")
140 ((stem:match "iveness$") "ive")
141 ((stem:match "fulness$") "ful")
142 ((stem:match "ousness$") "ous")
143 ((stem:match "aliti$") "al")
144 ((stem:match "iviti$") "ive")
145 ((stem:match "biliti$") "ble")))
151 (defun stem:step3 (str) "
\e$BBh
\e(B3
\e$BCJ3,$N
\e(B stemming rule (
\e$BHs8x3+4X?t
\e(B)"
155 ((stem:match "icate$") "ic")
156 ((stem:match "ative$") "")
157 ((stem:match "alize$") "al")
158 ((stem:match "iciti$") "ic")
159 ((stem:match "ical$") "ic")
160 ((stem:match "ful$") "")
161 ((stem:match "ness$") "")))
167 (defun stem:step4 (str) "
\e$BBh
\e(B4
\e$BCJ3,$N
\e(B stemming rule (
\e$BHs8x3+4X?t
\e(B)"
178 (stem:match "ement$")
181 (and (string-match "[st]\\(ion\\)$" str)
182 (setq stem (substring str 0 (match-beginning 1))))
194 (defun stem:step5 (str) "
\e$BBh
\e(B5
\e$BCJ3,$N
\e(B stemming rule (
\e$BHs8x3+4X?t
\e(B)"
197 (and (stem:match "e$")
201 (and (stem:match "ll$")
202 (setq stem (concat stem "l"))
207 (defvar stem:irregular-verb-alist
210 ("alighted" "alight")
216 ("baby-sat" "baby-sit")
217 ("backbit" "backbite")
218 ("backbitten" "backbite")
219 ("backslid" "backslide")
220 ("backslidden" "backslide")
221 ("was" "be" "am" "is" "are")
222 ("were" "be" "am" "is" "are")
223 ("been" "be" "am" "is" "are")
231 ("befallen" "befall")
238 ("begirded" "begird")
242 ("bereaved" "bereave")
244 ("besought" "beseech")
245 ("beseeched" "beseech")
247 ("bespoke" "bespeak")
248 ("bespoken" "bespeak")
249 ("bestrewed" "bestrew")
250 ("bestrewn" "bestrew")
251 ("bestrode" "bestride")
252 ("bestrid" "bestride")
253 ("bestridden" "bestride")
258 ("bethought" "bethink")
276 ("bottle-fed" "bottle-feed")
279 ("breast-fed" "breast-feed")
282 ("broadcast" "broadcast")
283 ("broadcasted" "broadcast")
284 ("browbeat" "browbeat")
285 ("browbeaten" "browbeat")
308 ("colorcast" "colorcast")
309 ("clorcasted" "colorcast")
314 ("countersank" "countersink")
315 ("countersunk" "countersink")
317 ("crossbred" "crossbreed")
326 ("deep-froze" "deep-freeze")
327 ("deep-freezed" "deep-freeze")
328 ("deep-frozen" "deep-freeze")
358 ("fled" "fly" "flee")
363 ("forbore" "forbear")
364 ("forborne" "forbear")
367 ("forbidden" "forbid")
368 ("forecast" "forecast")
369 ("forecasted" "forecast")
370 ("forewent" "forego")
371 ("foregone" "forego")
372 ("foreknew" "foreknow")
373 ("foreknown" "foreknow")
374 ("foreran" "forerun")
375 ("forerun" "forerun")
376 ("foresaw" "foresee")
377 ("foreseen" "foresee")
378 ("foreshowed" "foreshow")
379 ("foreshown" "foreshow")
380 ("foretold" "foretell")
382 ("forgotten" "forget")
383 ("forgave" "forgive")
384 ("forgiven" "forgive")
387 ("forsook" "forsake")
388 ("forsaken" "forsake")
389 ("forswore" "forswear")
390 ("forsworn" "forswear")
393 ("gainsaid" "gainsay")
398 ("ghostwrote" "ghostwrite")
399 ("ghostwritten" "ghostwrite")
417 ("hamstrung" "hamstring")
418 ("hamstringed" "hamstring")
432 ("indwelt" "indwell")
439 ("interwove" "interweave")
440 ("interweaved" "interweave")
441 ("jigsawed" "jigsaw")
467 ("lip-read" "lip-read")
473 ("methougt" "methinks")
475 ("misbecame" "misbecome")
476 ("misbecome" "misbecome")
477 ("miscast" "miscast")
478 ("miscasted" "miscast")
479 ("misdealt" "misdeal")
482 ("misgave" "misgive")
483 ("misgiven" "misgive")
487 ("misread" "misread")
488 ("misspelt" "misspell")
489 ("missplled" "misspell")
490 ("misspent" "misspend")
491 ("mistook" "mistake")
492 ("mistaken" "mistake")
493 ("misunderstood" "misunderstand")
499 ("outbidden" "outbid")
502 ("outfought" "outfight")
503 ("outgrew" "outgrown")
504 ("outgrown" "outgrown")
507 ("outputted" "output")
508 ("ooutputted" "output")
509 ("outrode" "outride")
510 ("outridden" "outride")
513 ("outsold" "outsell")
514 ("outshone" "outshine")
515 ("outshot" "outshoot")
516 ("outwore" "outwear")
517 ("outworn" "outwear")
518 ("overbore" "overbear")
519 ("overborne" "overbear")
520 ("overbid" "overbid")
521 ("overblew" "overblow")
522 ("overblown" "overblow")
523 ("overcame" "overcome")
524 ("overcome" "overcome")
526 ("overdone" "overdo")
527 ("overdrew" "overdraw")
528 ("overdrawn" "overdraw")
529 ("overdrank" "overdrink")
530 ("overdrunk" "overdrink")
531 ("overate" "overeat")
532 ("overeaten" "overeat")
533 ("overfed" "overfeed")
534 ("overflowed" "overflow")
535 ("overflown" "overfly" "overflow")
536 ("overflew" "overfly")
537 ("overgrew" "overgrow")
538 ("overgrown" "overgrow")
539 ("overhung" "overhang")
540 ("overhanged" "overhang")
541 ("ovearheard" "overhear")
542 ("overlaid" "overlay")
543 ("overleaped" "overleap")
544 ("overleapt" "overleap")
545 ("overlay" "overlie")
546 ("overlain" "overlie")
547 ("overpaid" "overpay")
548 ("overrode" "override")
549 ("overridden" "override")
550 ("overran" "overrun")
551 ("overrun" "overrun")
552 ("oversaw" "oversee")
553 ("overseen" "oversee")
554 ("oversold" "oversell")
555 ("overset" "overset")
556 ("overshot" "overshoot")
557 ("overspent" "overspend")
558 ("overspread" "overspread")
559 ("overtook" "overtake")
560 ("overtaken" "overtake")
561 ("overthrew" "overthrow")
562 ("overthrown" "overthrow")
563 ("overworked" "overwork")
564 ("overwrought" "overwork")
565 ("partook" "partake")
566 ("partaken" "partake")
570 ("pinch-hit" "pinch-hit")
576 ("proofread" "proofread")
580 ("quick-froze" "quick-freeze")
581 ("quick-frozen" "quick-freeze")
588 ("rebroadcast" "rebroadcast")
589 ("rebroadcasted" "rebroadcast")
590 ("rebuilt" "rebuild")
592 ("recasted" "recast")
610 ("rethought" "rethink")
612 ("rewinded" "rewind")
613 ("rewrote" "rewrite")
614 ("rewritten" "rewrite")
615 ("rid" "ride") ;; ("rid" "ride" "rid")
625 ("roughcast" "roughcast")
626 ("roughhewed" "roughhew")
627 ("roughhewn" "roughhew")
663 ("shrunken" "shrink")
668 ("sight-read" "sight-read")
669 ("simulcast" "simulcast")
670 ("simulcasted" "simulcast")
697 ("spellbound" "spellbind")
708 ("spoon-fed" "spoon-feed")
726 ("stridden" "stride")
728 ("stricken" "strike")
733 ("sunburned" "sunburn")
734 ("sunburnt" "sunburn")
751 ("telecast" "telecast")
752 ("telecasted" "telecast")
767 ("typecast" "typecast")
768 ("typewrote" "typewrite")
769 ("typewritten" "typewrite")
771 ("unbended" "unbend")
773 ("underbid" "underbid")
774 ("underbidden" "underbid")
775 ("undercut" "undercut")
776 ("underwent" "undergo")
777 ("undergone" "undergo")
778 ("underlaid" "underlay")
779 ("underlay" "underlie")
780 ("underlain" "underlie")
781 ("underpaid" "underpay")
782 ("undersold" "undersell")
783 ("undershot" "undershoot")
784 ("understood" "understand")
785 ("undertook" "undertake")
786 ("undertaken" "undertake")
787 ("underwrote" "underwrite")
788 ("underwritten" "underwrite")
793 ("ungirded" "ungird")
795 ("unlearnt" "unlearn")
796 ("unlearned" "unlearn")
799 ("unstuck" "unstick")
800 ("unstrung" "unstring")
806 ("upswept" "upsweep")
825 ("wiredrew" "wiredraw")
826 ("wiredrawn" "wiredraw")
828 ("withdrew" "withdraw")
829 ("withdrawn" "withdraw")
830 ("withheld" "withhold")
831 ("withstood" "withstand")
840 "
\e$BIT5,B'F0;l$H867A$NO"A[G[Ns
\e(B")
843 (defun stem:extra (str) "\
844 \e$BF0;l
\e(B/
\e$B7AMF;l$N3hMQ7A$HL>;l$NJ#?t7A$N3hMQ8lHx$r<h$j=|$/Hs8x3+4X?t
\e(B
845 \e$BM?$($i$l$?8l$N867A$H$7$F2DG=@-$N$"$k8l$N%j%9%H$rJV$9
\e(B"
846 (or (assoc str stem:irregular-verb-alist)
849 ;;
\e$BHf3S5i
\e(B/
\e$B:G>e5i
\e(B
850 ((stem:match "\\([^aeiou]\\)\\1e\\(r\\|st\\)$")
851 (list (substring str (match-beginning 1) (match-end 1))
852 (substring str (match-beginning 0) (match-beginning 2))))
853 ((stem:match "\\([^aeiou]\\)ie\\(r\\|st\\)$")
854 (setq c (substring str (match-beginning 1) (match-end 1)))
855 (list c (concat c "y") (concat c "ie")))
856 ((stem:match "e\\(r\\|st\\)$") '("" "e"))
857 ;; 3
\e$BC18=
\e(B/
\e$BJ#?t7A
\e(B
858 ((stem:match "ches$") '("ch" "che"))
859 ((stem:match "shes$") '("sh" "che"))
860 ((stem:match "ses$") '("s" "se"))
861 ((stem:match "xes$") '("x" "xe"))
862 ((stem:match "zes$") '("z" "ze"))
863 ((stem:match "ves$") '("f" "fe"))
864 ((stem:match "\\([^aeiou]\\)oes$")
865 (setq c (substring str -4 -3))
866 (list c (concat c "o") (concat c "oe")))
867 ((stem:match "\\([^aeiou]\\)ies$")
868 (setq c (substring str -4 -3))
869 (list c (concat c "y") (concat c "ie")))
870 ((stem:match "es$") '("" "e"))
871 ((stem:match "s$") '(""))
872 ;;
\e$B2a5n7A
\e(B/
\e$B2a5nJ,;l
\e(B
873 ((stem:match "\\([^aeiou]\\)ied$")
874 (setq c (substring str -4 -3))
875 (list c (concat c "y") (concat c "ie")))
876 ((stem:match "\\([^aeiou]\\)\\1ed$")
877 (list (substring str -4 -3)
878 (substring str -4 -1)))
879 ((stem:match "cked$") '("c" "cke"))
880 ((stem:match "ed$") '("" "e"))
882 ((stem:match "\\([^aeiou]\\)\\1ing$")
883 (list (substring str -5 -4)))
884 ((stem:match "ing$") '("" "e"))
886 (append (mapcar '(lambda (s) (concat stem s)) l)
892 ;;;============================================================
894 ;;;============================================================
896 (defun stem:stripping-suffix (str) "\
897 \e$B3hMQ8lHx$r<h$j=|$/4X?t
\e(B
898 \e$BM?$($i$l$?8l$N85$N8l$H$7$F2DG=@-$N$"$k8l$N<-=q=g$N%j%9%H$rJV$9
\e(B"
900 (delq nil (let ((w ""))
902 (function (lambda (x) (if (string= x w) nil (setq w x))))
904 ;;
\e$BBgJ8;z$r>.J8;z$KJQ49
\e(B
905 (list (prog1 str (setq str (downcase str))))
906 ;;
\e$BFH<+$N%R%e!<%j%9%F%#%C%/%9$rE,MQ
\e(B
908 (if (> (length str) stem:minimum-word-length)
909 ;;
\e$BC18lD9$,>r7o$rK~$?$;$P!"
\e(BPorter
\e$B$N%"%k%4%j%:%`$rE,MQ
\e(B
912 (setq str (funcall func str)))
913 '(stem:step1 stem:step2 stem:step3 stem:step4 stem:step5))))
918 (defun stem-english (str) "\
919 \e$B3hMQ8lHx$r<h$j=|$/4X?t
\e(B
920 \e$BM?$($i$l$?8l$N85$N8l$H$7$F2DG=@-$N$"$k8l$NJ8;zNsD9$N>:=g$N%j%9%H$rJV$9
\e(B"
921 (sort (stem:stripping-suffix str)
922 (function (lambda (a b) (< (length a) (length b))))))
924 ;;
\e$B$3$N
\e(B stem-english
\e$B$NF0:n$O!"
\e(B
926 ;; Id: stem.el,v 1.4 1998/11/30 09:27:27 tsuchiya Exp tsuchiya
928 ;;
\e$B0JA0$N%P!<%8%g%s$N
\e(B stem.el
\e$B$GDj5A$5$l$F$$$?
\e(B stem:stripping-suffix
929 ;;
\e$B$NF0:n$H8_49$G$"$k!#8=:_$N
\e(B stem:stripping-suffix
\e$B$O<-=q=g$N%j%9%H$r
\e(B
930 ;;
\e$BJV$9$?$a!"0[$J$kF0:n$H$9$k$h$&$K$J$C$F$$$k$N$GCm0U$9$k$3$H!#
\e(B
933 ;;; Porter
\e$B$N%"%k%4%j%:%`$rE,MQ$9$k4X?t
\e(B
934 (defun stem:stripping-inflection (word) "\
935 Porter
\e$B$N%"%k%4%j%:%`$K4p$E$$$FGI@88l$r=hM}$9$k4X?t
\e(B"
941 (stem:step1 word)))))))