Simplify Sphinx's Stemmer (#10467)
AUTHORS (2 changed lines)
@@ -96,5 +96,3 @@ authors and projects:
 
 * sphinx.util.jsdump uses the basestring encoding from simplejson,
   written by Bob Ippolito, released under the MIT license
-* sphinx.util.stemmer was written by Vivake Gupta, placed in the
-  Public Domain
CHANGES (3 changed lines)
@@ -10,6 +10,9 @@ Incompatible changes
 
 Deprecated
 ----------
 
+* #10467: Deprecated ``sphinx.util.stemmer`` in favour of ``snowballstemmer``.
+  Patch by Adam Turner.
+
 Features added
 --------------
doc/extdev/deprecated.rst

@@ -22,6 +22,11 @@ The following is a list of deprecated interfaces.
      - (will be) Removed
      - Alternatives
 
+   * - ``sphinx.util.stemmer``
+     - 5.1
+     - 7.0
+     - ``snowballstemmer``
+
    * - ``sphinx.util.jsdump``
      - 5.0
      - 7.0
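
Migration for consumers of the deprecated module is mechanical: construct a Snowball stemmer once and call ``stemWord()``. A minimal sketch, assuming ``snowballstemmer`` is installed (the sample words are illustrative, not part of the patch)::

    import snowballstemmer

    # Previously (deprecated from 5.1, to be removed in 7.0):
    #     from sphinx.util.stemmer import get_stemmer
    #     stem = get_stemmer().stem("building")

    stemmer = snowballstemmer.stemmer('porter')
    print(stemmer.stemWord("building"))            # stem a single word
    print(stemmer.stemWords(["builds", "built"]))  # or a list of words
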
sphinx/search/en.py

@@ -2,8 +2,9 @@
 
 from typing import Dict
 
+import snowballstemmer
+
 from sphinx.search import SearchLanguage
-from sphinx.util.stemmer import get_stemmer
 
 english_stopwords = set("""
 a and are as at
@@ -211,7 +212,7 @@ class SearchEnglish(SearchLanguage):
     stopwords = english_stopwords
 
     def init(self, options: Dict) -> None:
-        self.stemmer = get_stemmer()
+        self.stemmer = snowballstemmer.stemmer('porter')
 
     def stem(self, word: str) -> str:
-        return self.stemmer.stem(word.lower())
+        return self.stemmer.stemWord(word.lower())
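
The updated ``SearchEnglish`` thus reduces to the two ``snowballstemmer`` calls above. A standalone sketch of the same behaviour, with an illustrative class name and input (not part of the patch)::

    from typing import Dict

    import snowballstemmer


    class DemoEnglishStemming:
        """Mirrors the stemming calls of the patched SearchEnglish."""

        def init(self, options: Dict) -> None:
            # built once per search language, as in SearchEnglish.init()
            self.stemmer = snowballstemmer.stemmer('porter')

        def stem(self, word: str) -> str:
            # Snowball does not lower-case for us, so word.lower() stays
            return self.stemmer.stemWord(word.lower())


    demo = DemoEnglishStemming()
    demo.init({})
    print(demo.stem("Searching"))
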
sphinx/search/zh.py

@@ -4,8 +4,9 @@ import os
 import re
 from typing import Dict, List
 
+import snowballstemmer
+
 from sphinx.search import SearchLanguage
-from sphinx.util.stemmer import get_stemmer
 
 try:
     import jieba
@@ -230,7 +231,7 @@ class SearchChinese(SearchLanguage):
             if dict_path and os.path.isfile(dict_path):
                 jieba.load_userdict(dict_path)
 
-        self.stemmer = get_stemmer()
+        self.stemmer = snowballstemmer.stemmer('english')
 
     def split(self, input: str) -> List[str]:
         chinese: List[str] = []
@@ -252,8 +253,8 @@
         should_not_be_stemmed = (
            word in self.latin_terms and
            len(word) >= 3 and
-           len(self.stemmer.stem(word.lower())) < 3
+           len(self.stemmer.stemWord(word.lower())) < 3
         )
         if should_not_be_stemmed:
             return word.lower()
-        return self.stemmer.stem(word.lower())
+        return self.stemmer.stemWord(word.lower())
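
The guard for short Latin terms keeps working unchanged with the Snowball stemmer. A minimal sketch of that heuristic outside Sphinx, with an illustrative term list standing in for ``self.latin_terms``::

    import snowballstemmer

    stemmer = snowballstemmer.stemmer('english')
    latin_terms = ['CSS', 'Sphinx', 'docutils']  # illustrative only


    def stem_latin(word: str) -> str:
        # Keep a known Latin term as-is when stemming would leave it
        # shorter than three characters (mirrors should_not_be_stemmed).
        should_not_be_stemmed = (
            word in latin_terms and
            len(word) >= 3 and
            len(stemmer.stemWord(word.lower())) < 3
        )
        if should_not_be_stemmed:
            return word.lower()
        return stemmer.stemWord(word.lower())


    print(stem_latin('Sphinx'))
    print(stem_latin('documentation'))
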
sphinx/util/stemmer/__init__.py

@@ -1,37 +1,62 @@
 """Word stemming utilities for Sphinx."""
 
-from sphinx.util.stemmer.porter import PorterStemmer
+import warnings
 
-try:
-    from Stemmer import Stemmer as _PyStemmer
-    PYSTEMMER = True
-except ImportError:
-    PYSTEMMER = False
+import snowballstemmer
+
+from sphinx.deprecation import RemovedInSphinx70Warning
+
+
+class PorterStemmer:
+    def __init__(self):
+        warnings.warn(f"{self.__class__.__name__} is deprecated, use "
+                      "snowballstemmer.stemmer('porter') instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        self.stemmer = snowballstemmer.stemmer('porter')
+
+    def stem(self, p: str, i: int, j: int) -> str:
+        warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+                      "snowballstemmer.stemmer('porter').stemWord() instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        return self.stemmer.stemWord(p)
 
 
 class BaseStemmer:
+    def __init__(self):
+        warnings.warn(f"{self.__class__.__name__} is deprecated, use "
+                      "snowballstemmer.stemmer('porter') instead.",
+                      RemovedInSphinx70Warning, stacklevel=3)
+
     def stem(self, word: str) -> str:
-        raise NotImplementedError()
+        raise NotImplementedError
 
 
 class PyStemmer(BaseStemmer):
-    def __init__(self) -> None:
-        self.stemmer = _PyStemmer('porter')
+    def __init__(self):  # NoQA
+        super().__init__()
+        self.stemmer = snowballstemmer.stemmer('porter')
 
     def stem(self, word: str) -> str:
+        warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+                      "snowballstemmer.stemmer('porter').stemWord() instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
         return self.stemmer.stemWord(word)
 
 
-class StandardStemmer(PorterStemmer, BaseStemmer):
-    """All those porter stemmer implementations look hideous;
-    make at least the stem method nicer.
-    """
-    def stem(self, word: str) -> str:  # type: ignore
-        return super().stem(word, 0, len(word) - 1)
+class StandardStemmer(BaseStemmer):
+    def __init__(self):  # NoQA
+        super().__init__()
+        self.stemmer = snowballstemmer.stemmer('porter')
+
+    def stem(self, word: str) -> str:
+        warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
+                      "snowballstemmer.stemmer('porter').stemWord() instead.",
+                      RemovedInSphinx70Warning, stacklevel=2)
+        return self.stemmer.stemWord(word)
 
 
 def get_stemmer() -> BaseStemmer:
-    if PYSTEMMER:
-        return PyStemmer()
-    else:
-        return StandardStemmer()
+    warnings.warn("get_stemmer() is deprecated, use "
+                  "snowballstemmer.stemmer('porter') instead.",
+                  RemovedInSphinx70Warning, stacklevel=2)
+    return PyStemmer()
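
The shim keeps the old entry points importable but makes every use warn. A short sketch of what a caller sees with this patch applied (assuming Sphinx 5.1 or later, before 7.0; the sample word is illustrative)::

    import warnings

    from sphinx.util.stemmer import get_stemmer

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter('always')
        stemmer = get_stemmer()            # warns: use snowballstemmer.stemmer('porter')
        print(stemmer.stem('relational'))  # warns again; result now comes from Snowball

    for w in caught:
        print(w.category.__name__, '-', w.message)
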
sphinx/util/stemmer/porter.py
(entire file removed)

@@ -1,406 +0,0 @@
"""Porter Stemming Algorithm

This is the Porter stemming algorithm, ported to Python from the
version coded up in ANSI C by the author. It may be regarded
as canonical, in that it follows the algorithm presented in

Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
no. 3, pp 130-137,

only differing from it at the points made --DEPARTURE-- below.

See also https://tartarus.org/martin/PorterStemmer/

The algorithm as described in the paper could be exactly replicated
by adjusting the points of DEPARTURE, but this is barely necessary,
because (a) the points of DEPARTURE are definitely improvements, and
(b) no encoding of the Porter stemmer I have seen is anything like
as exact as this version, even with the points of DEPARTURE!

Release 1: January 2001

:author: Vivake Gupta <v@nano.com>.
:license: Public Domain ("can be used free of charge for any purpose").
"""


class PorterStemmer:

    def __init__(self) -> None:
        """The main part of the stemming algorithm starts here.
        b is a buffer holding a word to be stemmed. The letters are in b[k0],
        b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
        readjusted downwards as the stemming progresses. Zero termination is
        not in fact used in the algorithm.

        Note that only lower case sequences are stemmed. Forcing to lower case
        should be done before stem(...) is called.
        """

        self.b = ""  # buffer for word to be stemmed
        self.k = 0
        self.k0 = 0
        self.j = 0   # j is a general offset into the string

    def cons(self, i: int) -> int:
        """cons(i) is TRUE <=> b[i] is a consonant."""
        if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' \
           or self.b[i] == 'o' or self.b[i] == 'u':
            return 0
        if self.b[i] == 'y':
            if i == self.k0:
                return 1
            else:
                return (not self.cons(i - 1))
        return 1

    def m(self) -> int:
        """m() measures the number of consonant sequences between k0 and j.
        if c is a consonant sequence and v a vowel sequence, and <..>
        indicates arbitrary presence,

           <c><v>       gives 0
           <c>vc<v>     gives 1
           <c>vcvc<v>   gives 2
           <c>vcvcvc<v> gives 3
           ....
        """
        n = 0
        i = self.k0
        while 1:
            if i > self.j:
                return n
            if not self.cons(i):
                break
            i = i + 1
        i = i + 1
        while 1:
            while 1:
                if i > self.j:
                    return n
                if self.cons(i):
                    break
                i = i + 1
            i = i + 1
            n = n + 1
            while 1:
                if i > self.j:
                    return n
                if not self.cons(i):
                    break
                i = i + 1
            i = i + 1

    def vowelinstem(self) -> int:
        """vowelinstem() is TRUE <=> k0,...j contains a vowel"""
        for i in range(self.k0, self.j + 1):
            if not self.cons(i):
                return 1
        return 0

    def doublec(self, j: int) -> int:
        """doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
        if j < (self.k0 + 1):
            return 0
        if (self.b[j] != self.b[j - 1]):
            return 0
        return self.cons(j)

    def cvc(self, i: int) -> int:
        """cvc(i) is TRUE <=> i-2,i-1,i has the form
        consonant - vowel - consonant
        and also if the second c is not w,x or y. this is used when trying to
        restore an e at the end of a short word, e.g.

           cav(e), lov(e), hop(e), crim(e), but
           snow, box, tray.
        """
        if i < (self.k0 + 2) or not self.cons(i) or self.cons(i - 1) \
           or not self.cons(i - 2):
            return 0
        ch = self.b[i]
        if ch in ('w', 'x', 'y'):
            return 0
        return 1

    def ends(self, s: str) -> int:
        """ends(s) is TRUE <=> k0,...k ends with the string s."""
        length = len(s)
        if s[length - 1] != self.b[self.k]:  # tiny speed-up
            return 0
        if length > (self.k - self.k0 + 1):
            return 0
        if self.b[self.k - length + 1:self.k + 1] != s:
            return 0
        self.j = self.k - length
        return 1

    def setto(self, s: str) -> None:
        """setto(s) sets (j+1),...k to the characters in the string s,
        readjusting k."""
        length = len(s)
        self.b = self.b[:self.j + 1] + s + self.b[self.j + length + 1:]
        self.k = self.j + length

    def r(self, s: str) -> None:
        """r(s) is used further down."""
        if self.m() > 0:
            self.setto(s)

    def step1ab(self) -> None:
        """step1ab() gets rid of plurals and -ed or -ing. e.g.

           caresses  ->  caress
           ponies    ->  poni
           ties      ->  ti
           caress    ->  caress
           cats      ->  cat

           feed      ->  feed
           agreed    ->  agree
           disabled  ->  disable

           matting   ->  mat
           mating    ->  mate
           meeting   ->  meet
           milling   ->  mill
           messing   ->  mess

           meetings  ->  meet
        """
        if self.b[self.k] == 's':
            if self.ends("sses"):
                self.k = self.k - 2
            elif self.ends("ies"):
                self.setto("i")
            elif self.b[self.k - 1] != 's':
                self.k = self.k - 1
        if self.ends("eed"):
            if self.m() > 0:
                self.k = self.k - 1
        elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
            self.k = self.j
            if self.ends("at"):
                self.setto("ate")
            elif self.ends("bl"):
                self.setto("ble")
            elif self.ends("iz"):
                self.setto("ize")
            elif self.doublec(self.k):
                self.k = self.k - 1
                ch = self.b[self.k]
                if ch in ('l', 's', 'z'):
                    self.k = self.k + 1
            elif (self.m() == 1 and self.cvc(self.k)):
                self.setto("e")

    def step1c(self) -> None:
        """step1c() turns terminal y to i when there is another vowel in
        the stem."""
        if (self.ends("y") and self.vowelinstem()):
            self.b = self.b[:self.k] + 'i' + self.b[self.k + 1:]

    def step2(self) -> None:
        """step2() maps double suffices to single ones.
        so -ization ( = -ize plus -ation) maps to -ize etc. note that the
        string before the suffix must give m() > 0.
        """
        if self.b[self.k - 1] == 'a':
            if self.ends("ational"):
                self.r("ate")
            elif self.ends("tional"):
                self.r("tion")
        elif self.b[self.k - 1] == 'c':
            if self.ends("enci"):
                self.r("ence")
            elif self.ends("anci"):
                self.r("ance")
        elif self.b[self.k - 1] == 'e':
            if self.ends("izer"):
                self.r("ize")
        elif self.b[self.k - 1] == 'l':
            if self.ends("bli"):
                self.r("ble")  # --DEPARTURE--
            # To match the published algorithm, replace this phrase with
            #   if self.ends("abli"): self.r("able")
            elif self.ends("alli"):
                self.r("al")
            elif self.ends("entli"):
                self.r("ent")
            elif self.ends("eli"):
                self.r("e")
            elif self.ends("ousli"):
                self.r("ous")
        elif self.b[self.k - 1] == 'o':
            if self.ends("ization"):
                self.r("ize")
            elif self.ends("ation"):
                self.r("ate")
            elif self.ends("ator"):
                self.r("ate")
        elif self.b[self.k - 1] == 's':
            if self.ends("alism"):
                self.r("al")
            elif self.ends("iveness"):
                self.r("ive")
            elif self.ends("fulness"):
                self.r("ful")
            elif self.ends("ousness"):
                self.r("ous")
        elif self.b[self.k - 1] == 't':
            if self.ends("aliti"):
                self.r("al")
            elif self.ends("iviti"):
                self.r("ive")
            elif self.ends("biliti"):
                self.r("ble")
        elif self.b[self.k - 1] == 'g':  # --DEPARTURE--
            if self.ends("logi"):
                self.r("log")
        # To match the published algorithm, delete this phrase

    def step3(self) -> None:
        """step3() deals with -ic-, -full, -ness etc. similar strategy
        to step2."""
        if self.b[self.k] == 'e':
            if self.ends("icate"):
                self.r("ic")
            elif self.ends("ative"):
                self.r("")
            elif self.ends("alize"):
                self.r("al")
        elif self.b[self.k] == 'i':
            if self.ends("iciti"):
                self.r("ic")
        elif self.b[self.k] == 'l':
            if self.ends("ical"):
                self.r("ic")
            elif self.ends("ful"):
                self.r("")
        elif self.b[self.k] == 's':
            if self.ends("ness"):
                self.r("")

    def step4(self) -> None:
        """step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
        if self.b[self.k - 1] == 'a':
            if self.ends("al"):
                pass
            else:
                return
        elif self.b[self.k - 1] == 'c':
            if self.ends("ance"):
                pass
            elif self.ends("ence"):
                pass
            else:
                return
        elif self.b[self.k - 1] == 'e':
            if self.ends("er"):
                pass
            else:
                return
        elif self.b[self.k - 1] == 'i':
            if self.ends("ic"):
                pass
            else:
                return
        elif self.b[self.k - 1] == 'l':
            if self.ends("able"):
                pass
            elif self.ends("ible"):
                pass
            else:
                return
        elif self.b[self.k - 1] == 'n':
            if self.ends("ant"):
                pass
            elif self.ends("ement"):
                pass
            elif self.ends("ment"):
                pass
            elif self.ends("ent"):
                pass
            else:
                return
        elif self.b[self.k - 1] == 'o':
            if self.ends("ion") and (self.b[self.j] == 's' or
                                     self.b[self.j] == 't'):
                pass
            elif self.ends("ou"):
                pass
            # takes care of -ous
            else:
                return
        elif self.b[self.k - 1] == 's':
            if self.ends("ism"):
                pass
            else:
                return
        elif self.b[self.k - 1] == 't':
            if self.ends("ate"):
                pass
            elif self.ends("iti"):
                pass
            else:
                return
        elif self.b[self.k - 1] == 'u':
            if self.ends("ous"):
                pass
            else:
                return
        elif self.b[self.k - 1] == 'v':
            if self.ends("ive"):
                pass
            else:
                return
        elif self.b[self.k - 1] == 'z':
            if self.ends("ize"):
                pass
            else:
                return
        else:
            return
        if self.m() > 1:
            self.k = self.j

    def step5(self) -> None:
        """step5() removes a final -e if m() > 1, and changes -ll to -l if
        m() > 1.
        """
        self.j = self.k
        if self.b[self.k] == 'e':
            a = self.m()
            if a > 1 or (a == 1 and not self.cvc(self.k - 1)):
                self.k = self.k - 1
        if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
            self.k = self.k - 1

    def stem(self, p: str, i: int, j: int) -> str:
        """In stem(p,i,j), p is a char pointer, and the string to be stemmed
        is from p[i] to p[j] inclusive. Typically i is zero and j is the
        offset to the last character of a string, (p[j+1] == '\0'). The
        stemmer adjusts the characters p[i] ... p[j] and returns the new
        end-point of the string, k. Stemming never increases word length, so
        i <= k <= j. To turn the stemmer into a module, declare 'stem' as
        extern, and delete the remainder of this file.
        """
        # copy the parameters into statics
        self.b = p
        self.k = j
        self.k0 = i
        if self.k <= self.k0 + 1:
            return self.b  # --DEPARTURE--

        # With this line, strings of length 1 or 2 don't go through the
        # stemming process, although no mention is made of this in the
        # published algorithm. Remove the line to match the published
        # algorithm.

        self.step1ab()
        self.step1c()
        self.step2()
        self.step3()
        self.step4()
        self.step5()
        return self.b[self.k0:self.k + 1]
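
With the pure-Python implementation gone, the Snowball ``porter`` algorithm supplies the stems. The examples from the removed ``step1ab()`` docstring give a quick, informal cross-check; note they describe only step 1a/1b of the original algorithm, so agreement on arbitrary inputs should be verified locally rather than assumed::

    import snowballstemmer

    # Expected values copied from the removed docstring above.
    examples = {
        'caresses': 'caress',
        'ponies': 'poni',
        'ties': 'ti',
        'cats': 'cat',
        'agreed': 'agree',
        'meetings': 'meet',
    }

    stemmer = snowballstemmer.stemmer('porter')
    for word, expected in examples.items():
        print(f'{word:10} snowball={stemmer.stemWord(word):10} docstring={expected}')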