Sort of basic lemmatization.
Module sppas.src.resources
Class sppasWordStrain
Description
Constructor
Create a WordStrain instance.
Parameters
- filename: (str) 2 or 3 columns file with word/freq/wordstrain
View Source
def __init__(self, filename=None):
    """Create a WordStrain instance and optionally fill it from a file.

    The parent initializer is called without a dictionary file and with
    ``nodump=True``; the given file is then handed over to :meth:`load`,
    which does nothing when ``filename`` is None.

    :param filename: (str) 2 or 3 columns file with word/freq/wordstrain

    """
    # Start from the parent's state with no file; loading is delegated to
    # load() so that the column-count detection is applied.
    super(sppasWordStrain, self).__init__(
        dict_filename=None,
        nodump=True
    )
    self.load(filename)
Public functions
load
Load word substitutions from a file.
Replace the existing substitutions.
Parameters
- filename: (str) 2 or 3 columns file with word/freq/replacement
View Source
def load(self, filename):
    """Load word substitutions from a file.

    The number of columns of the first non-blank line selects the loader:
    fewer than 3 columns delegates to ``load_from_ascii``, otherwise the
    file is read as word/freq/replacement by ``__load_with_freq``.
    Leading blank lines are ignored for this detection, so that a
    3-columns file starting with an empty line is not mistaken for a
    2-columns one.

    NOTE(review): this method does not clear previously loaded entries
    itself; whether existing substitutions are replaced depends on the
    delegated loaders -- to be confirmed.

    :param filename: (str) 2 or 3 columns file with word/freq/replacement.
        Nothing is done if None.
    :raises: FileUnicodeError if the file can't be decoded with the
        expected encoding

    """
    if filename is None:
        return

    # Columns of the first line holding at least one token; stays empty
    # for an empty or whitespace-only file (handled as "fewer than 3").
    first_columns = []
    with codecs.open(filename, 'r', sg.__encoding__) as fd:
        try:
            for line in fd:
                first_columns = line.split()
                if first_columns:
                    break
        except UnicodeDecodeError:
            raise FileUnicodeError(filename=filename)
    # No explicit close(): the "with" statement already closes the file.

    if len(first_columns) < 3:
        self.load_from_ascii(filename)
    else:
        self.__load_with_freq(filename)
Protected functions
__load_with_freq
Load a replacement dictionary from a 3-columns ascii file.
Parameters
- filename: (str) Replacement dictionary file name
View Source
def __load_with_freq(self, filename):
    """Load a replacement dictionary from a 3-columns ascii file.

    Each usable line is made of whitespace-separated columns: the word
    (stored lowercase), an integer frequency, and the replacement (all
    the remaining columns, joined with ``sppasDictRepl.REPLACE_SEPARATOR``).
    Lines with fewer than 2 columns are ignored. When a word occurs on
    several lines, the entry with the strictly highest frequency is kept;
    on equal frequencies the first one read is kept.

    NOTE(review): ``self.__filename`` is name-mangled with the name of
    this class; confirm the parent class does not expect its own private
    attribute to be updated here.

    :param filename: (str) Replacement dictionary file name
    :raises: FileUnicodeError if the file can't be decoded with the
        expected encoding
    :raises: ValueError if the second column of a line is not an integer

    """
    # Read the whole file first, so that a decoding error is raised
    # before any entry is modified.
    with codecs.open(filename, 'r', sg.__encoding__) as stream:
        try:
            rows = stream.readlines()
        except UnicodeDecodeError:
            raise FileUnicodeError(filename=filename)

    self.__filename = filename

    # Highest frequency seen so far for each word of this file.
    best_freq = dict()
    for row in rows:
        columns = row.split()
        # Blank lines and lines without a frequency column are skipped.
        if len(columns) < 2:
            continue

        word = columns[0].lower()
        count = int(columns[1])
        replacement = sppasDictRepl.REPLACE_SEPARATOR.join(columns[2:])

        already_seen = word in best_freq
        # An already seen word is only overridden by a strictly more
        # frequent entry.
        if already_seen and count <= best_freq[word]:
            continue

        best_freq[word] = count
        if already_seen:
            # Remove the less frequent entry before adding the new one.
            self.pop(word)
        self.add(word, replacement)