mirror of
https://github.com/sphinx-doc/sphinx.git
synced 2025-02-25 18:55:22 -06:00
Simplify Sphinx's Stemmer (#10467)
This commit is contained in:
2
AUTHORS
2
AUTHORS
@@ -96,5 +96,3 @@ authors and projects:
|
|||||||
|
|
||||||
* sphinx.util.jsdump uses the basestring encoding from simplejson,
|
* sphinx.util.jsdump uses the basestring encoding from simplejson,
|
||||||
written by Bob Ippolito, released under the MIT license
|
written by Bob Ippolito, released under the MIT license
|
||||||
* sphinx.util.stemmer was written by Vivake Gupta, placed in the
|
|
||||||
Public Domain
|
|
||||||
|
|||||||
3
CHANGES
3
CHANGES
@@ -10,6 +10,9 @@ Incompatible changes
|
|||||||
Deprecated
|
Deprecated
|
||||||
----------
|
----------
|
||||||
|
|
||||||
|
* #10467: Deprecated ``sphinx.util.stemmer`` in favour of ``snowballstemmer``.
|
||||||
|
Patch by Adam Turner.
|
||||||
|
|
||||||
Features added
|
Features added
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
|
|||||||
@@ -22,6 +22,11 @@ The following is a list of deprecated interfaces.
|
|||||||
- (will be) Removed
|
- (will be) Removed
|
||||||
- Alternatives
|
- Alternatives
|
||||||
|
|
||||||
|
* - ``sphinx.util.stemmer``
|
||||||
|
- 5.1
|
||||||
|
- 7.0
|
||||||
|
- ``snowballstemmer``
|
||||||
|
|
||||||
* - ``sphinx.util.jsdump``
|
* - ``sphinx.util.jsdump``
|
||||||
- 5.0
|
- 5.0
|
||||||
- 7.0
|
- 7.0
|
||||||
|
|||||||
@@ -2,8 +2,9 @@
|
|||||||
|
|
||||||
from typing import Dict
|
from typing import Dict
|
||||||
|
|
||||||
|
import snowballstemmer
|
||||||
|
|
||||||
from sphinx.search import SearchLanguage
|
from sphinx.search import SearchLanguage
|
||||||
from sphinx.util.stemmer import get_stemmer
|
|
||||||
|
|
||||||
english_stopwords = set("""
|
english_stopwords = set("""
|
||||||
a and are as at
|
a and are as at
|
||||||
@@ -211,7 +212,7 @@ class SearchEnglish(SearchLanguage):
|
|||||||
stopwords = english_stopwords
|
stopwords = english_stopwords
|
||||||
|
|
||||||
def init(self, options: Dict) -> None:
|
def init(self, options: Dict) -> None:
|
||||||
self.stemmer = get_stemmer()
|
self.stemmer = snowballstemmer.stemmer('porter')
|
||||||
|
|
||||||
def stem(self, word: str) -> str:
|
def stem(self, word: str) -> str:
|
||||||
return self.stemmer.stem(word.lower())
|
return self.stemmer.stemWord(word.lower())
|
||||||
|
|||||||
@@ -4,8 +4,9 @@ import os
|
|||||||
import re
|
import re
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
|
|
||||||
|
import snowballstemmer
|
||||||
|
|
||||||
from sphinx.search import SearchLanguage
|
from sphinx.search import SearchLanguage
|
||||||
from sphinx.util.stemmer import get_stemmer
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import jieba
|
import jieba
|
||||||
@@ -230,7 +231,7 @@ class SearchChinese(SearchLanguage):
|
|||||||
if dict_path and os.path.isfile(dict_path):
|
if dict_path and os.path.isfile(dict_path):
|
||||||
jieba.load_userdict(dict_path)
|
jieba.load_userdict(dict_path)
|
||||||
|
|
||||||
self.stemmer = get_stemmer()
|
self.stemmer = snowballstemmer.stemmer('english')
|
||||||
|
|
||||||
def split(self, input: str) -> List[str]:
|
def split(self, input: str) -> List[str]:
|
||||||
chinese: List[str] = []
|
chinese: List[str] = []
|
||||||
@@ -252,8 +253,8 @@ class SearchChinese(SearchLanguage):
|
|||||||
should_not_be_stemmed = (
|
should_not_be_stemmed = (
|
||||||
word in self.latin_terms and
|
word in self.latin_terms and
|
||||||
len(word) >= 3 and
|
len(word) >= 3 and
|
||||||
len(self.stemmer.stem(word.lower())) < 3
|
len(self.stemmer.stemWord(word.lower())) < 3
|
||||||
)
|
)
|
||||||
if should_not_be_stemmed:
|
if should_not_be_stemmed:
|
||||||
return word.lower()
|
return word.lower()
|
||||||
return self.stemmer.stem(word.lower())
|
return self.stemmer.stemWord(word.lower())
|
||||||
|
|||||||
@@ -1,37 +1,62 @@
|
|||||||
"""Word stemming utilities for Sphinx."""
|
"""Word stemming utilities for Sphinx."""
|
||||||
|
|
||||||
from sphinx.util.stemmer.porter import PorterStemmer
|
import warnings
|
||||||
|
|
||||||
try:
|
import snowballstemmer
|
||||||
from Stemmer import Stemmer as _PyStemmer
|
|
||||||
PYSTEMMER = True
|
from sphinx.deprecation import RemovedInSphinx70Warning
|
||||||
except ImportError:
|
|
||||||
PYSTEMMER = False
|
|
||||||
|
class PorterStemmer:
|
||||||
|
def __init__(self):
|
||||||
|
warnings.warn(f"{self.__class__.__name__} is deprecated, use "
|
||||||
|
"snowballstemmer.stemmer('porter') instead.",
|
||||||
|
RemovedInSphinx70Warning, stacklevel=2)
|
||||||
|
self.stemmer = snowballstemmer.stemmer('porter')
|
||||||
|
|
||||||
|
def stem(self, p: str, i: int, j: int) -> str:
|
||||||
|
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
|
||||||
|
"snowballstemmer.stemmer('porter').stemWord() instead.",
|
||||||
|
RemovedInSphinx70Warning, stacklevel=2)
|
||||||
|
return self.stemmer.stemWord(p)
|
||||||
|
|
||||||
|
|
||||||
class BaseStemmer:
|
class BaseStemmer:
|
||||||
|
def __init__(self):
|
||||||
|
warnings.warn(f"{self.__class__.__name__} is deprecated, use "
|
||||||
|
"snowballstemmer.stemmer('porter') instead.",
|
||||||
|
RemovedInSphinx70Warning, stacklevel=3)
|
||||||
|
|
||||||
def stem(self, word: str) -> str:
|
def stem(self, word: str) -> str:
|
||||||
raise NotImplementedError()
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class PyStemmer(BaseStemmer):
|
class PyStemmer(BaseStemmer):
|
||||||
def __init__(self) -> None:
|
def __init__(self): # NoQA
|
||||||
self.stemmer = _PyStemmer('porter')
|
super().__init__()
|
||||||
|
self.stemmer = snowballstemmer.stemmer('porter')
|
||||||
|
|
||||||
def stem(self, word: str) -> str:
|
def stem(self, word: str) -> str:
|
||||||
|
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
|
||||||
|
"snowballstemmer.stemmer('porter').stemWord() instead.",
|
||||||
|
RemovedInSphinx70Warning, stacklevel=2)
|
||||||
return self.stemmer.stemWord(word)
|
return self.stemmer.stemWord(word)
|
||||||
|
|
||||||
|
|
||||||
class StandardStemmer(PorterStemmer, BaseStemmer):
|
class StandardStemmer(BaseStemmer):
|
||||||
"""All those porter stemmer implementations look hideous;
|
def __init__(self): # NoQA
|
||||||
make at least the stem method nicer.
|
super().__init__()
|
||||||
"""
|
self.stemmer = snowballstemmer.stemmer('porter')
|
||||||
def stem(self, word: str) -> str: # type: ignore
|
|
||||||
return super().stem(word, 0, len(word) - 1)
|
def stem(self, word: str) -> str:
|
||||||
|
warnings.warn(f"{self.__class__.__name__}.stem() is deprecated, use "
|
||||||
|
"snowballstemmer.stemmer('porter').stemWord() instead.",
|
||||||
|
RemovedInSphinx70Warning, stacklevel=2)
|
||||||
|
return self.stemmer.stemWord(word)
|
||||||
|
|
||||||
|
|
||||||
def get_stemmer() -> BaseStemmer:
|
def get_stemmer() -> BaseStemmer:
|
||||||
if PYSTEMMER:
|
warnings.warn("get_stemmer() is deprecated, use "
|
||||||
return PyStemmer()
|
"snowballstemmer.stemmer('porter') instead.",
|
||||||
else:
|
RemovedInSphinx70Warning, stacklevel=2)
|
||||||
return StandardStemmer()
|
return PyStemmer()
|
||||||
|
|||||||
@@ -1,406 +0,0 @@
|
|||||||
"""Porter Stemming Algorithm
|
|
||||||
|
|
||||||
This is the Porter stemming algorithm, ported to Python from the
|
|
||||||
version coded up in ANSI C by the author. It may be regarded
|
|
||||||
as canonical, in that it follows the algorithm presented in
|
|
||||||
|
|
||||||
Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
|
|
||||||
no. 3, pp 130-137,
|
|
||||||
|
|
||||||
only differing from it at the points made --DEPARTURE-- below.
|
|
||||||
|
|
||||||
See also https://tartarus.org/martin/PorterStemmer/
|
|
||||||
|
|
||||||
The algorithm as described in the paper could be exactly replicated
|
|
||||||
by adjusting the points of DEPARTURE, but this is barely necessary,
|
|
||||||
because (a) the points of DEPARTURE are definitely improvements, and
|
|
||||||
(b) no encoding of the Porter stemmer I have seen is anything like
|
|
||||||
as exact as this version, even with the points of DEPARTURE!
|
|
||||||
|
|
||||||
Release 1: January 2001
|
|
||||||
|
|
||||||
:author: Vivake Gupta <v@nano.com>.
|
|
||||||
:license: Public Domain ("can be used free of charge for any purpose").
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
class PorterStemmer:
|
|
||||||
|
|
||||||
def __init__(self) -> None:
|
|
||||||
"""The main part of the stemming algorithm starts here.
|
|
||||||
b is a buffer holding a word to be stemmed. The letters are in b[k0],
|
|
||||||
b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is
|
|
||||||
readjusted downwards as the stemming progresses. Zero termination is
|
|
||||||
not in fact used in the algorithm.
|
|
||||||
|
|
||||||
Note that only lower case sequences are stemmed. Forcing to lower case
|
|
||||||
should be done before stem(...) is called.
|
|
||||||
"""
|
|
||||||
|
|
||||||
self.b = "" # buffer for word to be stemmed
|
|
||||||
self.k = 0
|
|
||||||
self.k0 = 0
|
|
||||||
self.j = 0 # j is a general offset into the string
|
|
||||||
|
|
||||||
def cons(self, i: int) -> int:
|
|
||||||
"""cons(i) is TRUE <=> b[i] is a consonant."""
|
|
||||||
if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' \
|
|
||||||
or self.b[i] == 'o' or self.b[i] == 'u':
|
|
||||||
return 0
|
|
||||||
if self.b[i] == 'y':
|
|
||||||
if i == self.k0:
|
|
||||||
return 1
|
|
||||||
else:
|
|
||||||
return (not self.cons(i - 1))
|
|
||||||
return 1
|
|
||||||
|
|
||||||
def m(self) -> int:
|
|
||||||
"""m() measures the number of consonant sequences between k0 and j.
|
|
||||||
if c is a consonant sequence and v a vowel sequence, and <..>
|
|
||||||
indicates arbitrary presence,
|
|
||||||
|
|
||||||
<c><v> gives 0
|
|
||||||
<c>vc<v> gives 1
|
|
||||||
<c>vcvc<v> gives 2
|
|
||||||
<c>vcvcvc<v> gives 3
|
|
||||||
....
|
|
||||||
"""
|
|
||||||
n = 0
|
|
||||||
i = self.k0
|
|
||||||
while 1:
|
|
||||||
if i > self.j:
|
|
||||||
return n
|
|
||||||
if not self.cons(i):
|
|
||||||
break
|
|
||||||
i = i + 1
|
|
||||||
i = i + 1
|
|
||||||
while 1:
|
|
||||||
while 1:
|
|
||||||
if i > self.j:
|
|
||||||
return n
|
|
||||||
if self.cons(i):
|
|
||||||
break
|
|
||||||
i = i + 1
|
|
||||||
i = i + 1
|
|
||||||
n = n + 1
|
|
||||||
while 1:
|
|
||||||
if i > self.j:
|
|
||||||
return n
|
|
||||||
if not self.cons(i):
|
|
||||||
break
|
|
||||||
i = i + 1
|
|
||||||
i = i + 1
|
|
||||||
|
|
||||||
def vowelinstem(self) -> int:
|
|
||||||
"""vowelinstem() is TRUE <=> k0,...j contains a vowel"""
|
|
||||||
for i in range(self.k0, self.j + 1):
|
|
||||||
if not self.cons(i):
|
|
||||||
return 1
|
|
||||||
return 0
|
|
||||||
|
|
||||||
def doublec(self, j: int) -> int:
|
|
||||||
"""doublec(j) is TRUE <=> j,(j-1) contain a double consonant."""
|
|
||||||
if j < (self.k0 + 1):
|
|
||||||
return 0
|
|
||||||
if (self.b[j] != self.b[j - 1]):
|
|
||||||
return 0
|
|
||||||
return self.cons(j)
|
|
||||||
|
|
||||||
def cvc(self, i: int) -> int:
|
|
||||||
"""cvc(i) is TRUE <=> i-2,i-1,i has the form
|
|
||||||
consonant - vowel - consonant
|
|
||||||
and also if the second c is not w,x or y. this is used when trying to
|
|
||||||
restore an e at the end of a short word, e.g.
|
|
||||||
|
|
||||||
cav(e), lov(e), hop(e), crim(e), but
|
|
||||||
snow, box, tray.
|
|
||||||
"""
|
|
||||||
if i < (self.k0 + 2) or not self.cons(i) or self.cons(i - 1) \
|
|
||||||
or not self.cons(i - 2):
|
|
||||||
return 0
|
|
||||||
ch = self.b[i]
|
|
||||||
if ch in ('w', 'x', 'y'):
|
|
||||||
return 0
|
|
||||||
return 1
|
|
||||||
|
|
||||||
def ends(self, s: str) -> int:
|
|
||||||
"""ends(s) is TRUE <=> k0,...k ends with the string s."""
|
|
||||||
length = len(s)
|
|
||||||
if s[length - 1] != self.b[self.k]: # tiny speed-up
|
|
||||||
return 0
|
|
||||||
if length > (self.k - self.k0 + 1):
|
|
||||||
return 0
|
|
||||||
if self.b[self.k - length + 1:self.k + 1] != s:
|
|
||||||
return 0
|
|
||||||
self.j = self.k - length
|
|
||||||
return 1
|
|
||||||
|
|
||||||
def setto(self, s: str) -> None:
|
|
||||||
"""setto(s) sets (j+1),...k to the characters in the string s,
|
|
||||||
readjusting k."""
|
|
||||||
length = len(s)
|
|
||||||
self.b = self.b[:self.j + 1] + s + self.b[self.j + length + 1:]
|
|
||||||
self.k = self.j + length
|
|
||||||
|
|
||||||
def r(self, s: str) -> None:
|
|
||||||
"""r(s) is used further down."""
|
|
||||||
if self.m() > 0:
|
|
||||||
self.setto(s)
|
|
||||||
|
|
||||||
def step1ab(self) -> None:
|
|
||||||
"""step1ab() gets rid of plurals and -ed or -ing. e.g.
|
|
||||||
|
|
||||||
caresses -> caress
|
|
||||||
ponies -> poni
|
|
||||||
ties -> ti
|
|
||||||
caress -> caress
|
|
||||||
cats -> cat
|
|
||||||
|
|
||||||
feed -> feed
|
|
||||||
agreed -> agree
|
|
||||||
disabled -> disable
|
|
||||||
|
|
||||||
matting -> mat
|
|
||||||
mating -> mate
|
|
||||||
meeting -> meet
|
|
||||||
milling -> mill
|
|
||||||
messing -> mess
|
|
||||||
|
|
||||||
meetings -> meet
|
|
||||||
"""
|
|
||||||
if self.b[self.k] == 's':
|
|
||||||
if self.ends("sses"):
|
|
||||||
self.k = self.k - 2
|
|
||||||
elif self.ends("ies"):
|
|
||||||
self.setto("i")
|
|
||||||
elif self.b[self.k - 1] != 's':
|
|
||||||
self.k = self.k - 1
|
|
||||||
if self.ends("eed"):
|
|
||||||
if self.m() > 0:
|
|
||||||
self.k = self.k - 1
|
|
||||||
elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem():
|
|
||||||
self.k = self.j
|
|
||||||
if self.ends("at"):
|
|
||||||
self.setto("ate")
|
|
||||||
elif self.ends("bl"):
|
|
||||||
self.setto("ble")
|
|
||||||
elif self.ends("iz"):
|
|
||||||
self.setto("ize")
|
|
||||||
elif self.doublec(self.k):
|
|
||||||
self.k = self.k - 1
|
|
||||||
ch = self.b[self.k]
|
|
||||||
if ch in ('l', 's', 'z'):
|
|
||||||
self.k = self.k + 1
|
|
||||||
elif (self.m() == 1 and self.cvc(self.k)):
|
|
||||||
self.setto("e")
|
|
||||||
|
|
||||||
def step1c(self) -> None:
|
|
||||||
"""step1c() turns terminal y to i when there is another vowel in
|
|
||||||
the stem."""
|
|
||||||
if (self.ends("y") and self.vowelinstem()):
|
|
||||||
self.b = self.b[:self.k] + 'i' + self.b[self.k + 1:]
|
|
||||||
|
|
||||||
def step2(self) -> None:
|
|
||||||
"""step2() maps double suffices to single ones.
|
|
||||||
so -ization ( = -ize plus -ation) maps to -ize etc. note that the
|
|
||||||
string before the suffix must give m() > 0.
|
|
||||||
"""
|
|
||||||
if self.b[self.k - 1] == 'a':
|
|
||||||
if self.ends("ational"):
|
|
||||||
self.r("ate")
|
|
||||||
elif self.ends("tional"):
|
|
||||||
self.r("tion")
|
|
||||||
elif self.b[self.k - 1] == 'c':
|
|
||||||
if self.ends("enci"):
|
|
||||||
self.r("ence")
|
|
||||||
elif self.ends("anci"):
|
|
||||||
self.r("ance")
|
|
||||||
elif self.b[self.k - 1] == 'e':
|
|
||||||
if self.ends("izer"):
|
|
||||||
self.r("ize")
|
|
||||||
elif self.b[self.k - 1] == 'l':
|
|
||||||
if self.ends("bli"):
|
|
||||||
self.r("ble") # --DEPARTURE--
|
|
||||||
# To match the published algorithm, replace this phrase with
|
|
||||||
# if self.ends("abli"): self.r("able")
|
|
||||||
elif self.ends("alli"):
|
|
||||||
self.r("al")
|
|
||||||
elif self.ends("entli"):
|
|
||||||
self.r("ent")
|
|
||||||
elif self.ends("eli"):
|
|
||||||
self.r("e")
|
|
||||||
elif self.ends("ousli"):
|
|
||||||
self.r("ous")
|
|
||||||
elif self.b[self.k - 1] == 'o':
|
|
||||||
if self.ends("ization"):
|
|
||||||
self.r("ize")
|
|
||||||
elif self.ends("ation"):
|
|
||||||
self.r("ate")
|
|
||||||
elif self.ends("ator"):
|
|
||||||
self.r("ate")
|
|
||||||
elif self.b[self.k - 1] == 's':
|
|
||||||
if self.ends("alism"):
|
|
||||||
self.r("al")
|
|
||||||
elif self.ends("iveness"):
|
|
||||||
self.r("ive")
|
|
||||||
elif self.ends("fulness"):
|
|
||||||
self.r("ful")
|
|
||||||
elif self.ends("ousness"):
|
|
||||||
self.r("ous")
|
|
||||||
elif self.b[self.k - 1] == 't':
|
|
||||||
if self.ends("aliti"):
|
|
||||||
self.r("al")
|
|
||||||
elif self.ends("iviti"):
|
|
||||||
self.r("ive")
|
|
||||||
elif self.ends("biliti"):
|
|
||||||
self.r("ble")
|
|
||||||
elif self.b[self.k - 1] == 'g': # --DEPARTURE--
|
|
||||||
if self.ends("logi"):
|
|
||||||
self.r("log")
|
|
||||||
# To match the published algorithm, delete this phrase
|
|
||||||
|
|
||||||
def step3(self) -> None:
|
|
||||||
"""step3() deals with -ic-, -full, -ness etc. similar strategy
|
|
||||||
to step2."""
|
|
||||||
if self.b[self.k] == 'e':
|
|
||||||
if self.ends("icate"):
|
|
||||||
self.r("ic")
|
|
||||||
elif self.ends("ative"):
|
|
||||||
self.r("")
|
|
||||||
elif self.ends("alize"):
|
|
||||||
self.r("al")
|
|
||||||
elif self.b[self.k] == 'i':
|
|
||||||
if self.ends("iciti"):
|
|
||||||
self.r("ic")
|
|
||||||
elif self.b[self.k] == 'l':
|
|
||||||
if self.ends("ical"):
|
|
||||||
self.r("ic")
|
|
||||||
elif self.ends("ful"):
|
|
||||||
self.r("")
|
|
||||||
elif self.b[self.k] == 's':
|
|
||||||
if self.ends("ness"):
|
|
||||||
self.r("")
|
|
||||||
|
|
||||||
def step4(self) -> None:
|
|
||||||
"""step4() takes off -ant, -ence etc., in context <c>vcvc<v>."""
|
|
||||||
if self.b[self.k - 1] == 'a':
|
|
||||||
if self.ends("al"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 'c':
|
|
||||||
if self.ends("ance"):
|
|
||||||
pass
|
|
||||||
elif self.ends("ence"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 'e':
|
|
||||||
if self.ends("er"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 'i':
|
|
||||||
if self.ends("ic"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 'l':
|
|
||||||
if self.ends("able"):
|
|
||||||
pass
|
|
||||||
elif self.ends("ible"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 'n':
|
|
||||||
if self.ends("ant"):
|
|
||||||
pass
|
|
||||||
elif self.ends("ement"):
|
|
||||||
pass
|
|
||||||
elif self.ends("ment"):
|
|
||||||
pass
|
|
||||||
elif self.ends("ent"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 'o':
|
|
||||||
if self.ends("ion") and (self.b[self.j] == 's' or
|
|
||||||
self.b[self.j] == 't'):
|
|
||||||
pass
|
|
||||||
elif self.ends("ou"):
|
|
||||||
pass
|
|
||||||
# takes care of -ous
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 's':
|
|
||||||
if self.ends("ism"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 't':
|
|
||||||
if self.ends("ate"):
|
|
||||||
pass
|
|
||||||
elif self.ends("iti"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 'u':
|
|
||||||
if self.ends("ous"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 'v':
|
|
||||||
if self.ends("ive"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
elif self.b[self.k - 1] == 'z':
|
|
||||||
if self.ends("ize"):
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
else:
|
|
||||||
return
|
|
||||||
if self.m() > 1:
|
|
||||||
self.k = self.j
|
|
||||||
|
|
||||||
def step5(self) -> None:
|
|
||||||
"""step5() removes a final -e if m() > 1, and changes -ll to -l if
|
|
||||||
m() > 1.
|
|
||||||
"""
|
|
||||||
self.j = self.k
|
|
||||||
if self.b[self.k] == 'e':
|
|
||||||
a = self.m()
|
|
||||||
if a > 1 or (a == 1 and not self.cvc(self.k - 1)):
|
|
||||||
self.k = self.k - 1
|
|
||||||
if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1:
|
|
||||||
self.k = self.k - 1
|
|
||||||
|
|
||||||
def stem(self, p: str, i: int, j: int) -> str:
|
|
||||||
"""In stem(p,i,j), p is a char pointer, and the string to be stemmed
|
|
||||||
is from p[i] to p[j] inclusive. Typically i is zero and j is the
|
|
||||||
offset to the last character of a string, (p[j+1] == '\0'). The
|
|
||||||
stemmer adjusts the characters p[i] ... p[j] and returns the new
|
|
||||||
end-point of the string, k. Stemming never increases word length, so
|
|
||||||
i <= k <= j. To turn the stemmer into a module, declare 'stem' as
|
|
||||||
extern, and delete the remainder of this file.
|
|
||||||
"""
|
|
||||||
# copy the parameters into statics
|
|
||||||
self.b = p
|
|
||||||
self.k = j
|
|
||||||
self.k0 = i
|
|
||||||
if self.k <= self.k0 + 1:
|
|
||||||
return self.b # --DEPARTURE--
|
|
||||||
|
|
||||||
# With this line, strings of length 1 or 2 don't go through the
|
|
||||||
# stemming process, although no mention is made of this in the
|
|
||||||
# published algorithm. Remove the line to match the published
|
|
||||||
# algorithm.
|
|
||||||
|
|
||||||
self.step1ab()
|
|
||||||
self.step1c()
|
|
||||||
self.step2()
|
|
||||||
self.step3()
|
|
||||||
self.step4()
|
|
||||||
self.step5()
|
|
||||||
return self.b[self.k0:self.k + 1]
|
|
||||||
Reference in New Issue
Block a user