SPPAS integration of the identification of stop words in a tier.
Module sppas.src.annotations
Class sppasStopWords
Description
Constructor
Create a new instance.
Parameters
- log: (sppasLog) Human-readable logs.
View Source
def __init__(self, log=None):
    """Create a new instance.

    The parent class is initialized with 'stopwords.json' and the given
    log, then a new StopWords instance is attached to this annotation.

    :param log: (sppasLog) Human-readable logs.

    """
    config_name = 'stopwords.json'
    super(sppasStopWords, self).__init__(config_name, log)

    # Stop-words container used by set_alpha(), load_resources() and
    # make_stp_tier(). Created after the parent initialization, as in
    # the original statement order.
    self._stops = StopWords()
Public functions
fix_options
Fix all options.
Parameters
- options: list of sppasOption instances
View Source
def fix_options(self, options):
    """Fix all options.

    The 'alpha' and 'tiername' options are delegated to their setters.
    Any other option whose key contains 'pattern' is stored as it is.

    :param options: list of sppasOption instances
    :raises: AnnotationOptionError if an option key is not supported

    """
    for option in options:
        name = option.get_key()

        if name == 'alpha':
            self.set_alpha(option.get_value())
            continue

        if name == 'tiername':
            self.set_tiername(option.get_value())
            continue

        # Only the "pattern" options are accepted without a setter.
        if 'pattern' not in name:
            raise AnnotationOptionError(name)

        self._options[name] = option.get_value()
set_alpha
Fix the alpha option.
Alpha is a coefficient to add specific stop-words in the list.
Parameters
- alpha: (float)
View Source
def set_alpha(self, alpha):
    """Fix the alpha option.

    Alpha is a coefficient to add specific stop-words in the list.
    The value is forwarded to the stop-words container first, then
    stored in the options of this annotation.

    :param alpha: (float)

    """
    # Forward first: if the container rejects the value by raising,
    # the option is not stored.
    self._stops.set_alpha(alpha)

    self._options['alpha'] = alpha
set_tiername
Fix the tiername option.
Parameters
- tier_name: (str)
View Source
def set_tiername(self, tier_name):
    """Fix the tiername option.

    The stored value is the result of sppasUnicode(tier_name).to_strip(),
    not the raw given string.

    :param tier_name: (str)

    """
    # NOTE(review): to_strip() presumably normalizes the whitespace of
    # the name -- confirm against sppasUnicode.
    stripped_name = sppasUnicode(tier_name).to_strip()
    self._options['tiername'] = stripped_name
load_resources
Load a list of stop-words and replacements.
Override the existing loaded lists...
Parameters
- lang_resources: (str) Path of the stop-words file; its extension, if any, is replaced by '.stp'
- lang: (str)
View Source
def load_resources(self, lang_resources, lang=None):
    """Load a list of stop-words and replacements.

    Override the existing loaded lists...
    The extension of the given filename is replaced by '.stp' before
    loading. If loading fails, the list of stop-words is cleared and
    the error is reported in the log instead of being raised.

    :param lang_resources: (str) File with extension '.stp' or nothing
    :param lang: (str) Not used by this method.

    """
    # Only the root of the filename matters: the extension is forced below.
    root = os.path.splitext(lang_resources)[0]

    try:
        stp_filename = root + '.stp'
        self._stops.load(stp_filename, merge=False)
        loaded_msg = 'The initial list contains {:d} stop-words'.format(len(self._stops))
        self.logfile.print_message(loaded_msg, indent=0)
    except Exception as err:
        # Best effort: continue with no stop-words and tell the user why.
        self._stops.clear()
        failed_msg = 'No stop-words loaded: {:s}'.format(str(err))
        self.logfile.print_message(failed_msg, indent=1)
make_stp_tier
Return a tier indicating if entries are stop-words.
Parameters
- tier: (sppasTier)
View Source
def make_stp_tier(self, tier):
    """Return a tier indicating if entries are stop-words.

    A copy of the loaded stop-words is evaluated on the given tier, then
    a tier named 'IsStopWord' is created with one annotation for each
    annotation of the input. Each label whose best tag is not one of
    symbols.all gives a boolean label: is its content a stop-word or not.

    :param tier: (sppasTier)
    :returns: (sppasTier)

    """
    # The evaluation is applied to a copy, not directly to self._stops.
    # NOTE(review): presumably copy() returns an independent list -- confirm.
    stops = self._stops.copy()
    nb_evaluated = stops.evaluate(tier, merge=True)

    self.logfile.print_message('Number of stop-words evaluated: {:d}'.format(nb_evaluated), indent=1)
    self.logfile.print_message('The list contains {:d} stop-words'.format(len(stops)), indent=1)
    logging.info('Vocabulary size: {:d}'.format(stops.get_v()))
    logging.info('Threshold proba: {:f}'.format(stops.get_threshold()))

    stp_tier = sppasTier('IsStopWord')
    for ann in tier:
        # Lazily get the content of the best tag of each label.
        # NOTE(review): assumes get_best() never returns None -- confirm.
        contents = (label.get_best().get_content() for label in ann.get_labels())

        # Labels matching a symbol are ignored: the annotation can then
        # be created with fewer labels than the input one, or even none.
        new_labels = [
            sppasLabel(sppasTag(stops.is_in(content), tag_type='bool'))
            for content in contents
            if content not in symbols.all
        ]
        stp_tier.create_annotation(ann.get_location().copy(), new_labels)

    return stp_tier
get_inputs
Return the tier with aligned tokens.
Parameters
- input_files: (list)
Raises
NoTierInputError
Returns
- (sppasTier)
View Source
def get_inputs(self, input_files):
    """Return the tier with aligned tokens.

    Files are examined in the given order. Only those with an extension
    in the first item of get_input_extensions() are read, and the first
    tier matching the 'tiername' option (case-insensitive) is returned.

    :param input_files: (list)
    :raise: NoTierInputError
    :return: (sppasTier)

    """
    extensions = self.get_input_extensions()

    for filename in input_files:
        file_ext = os.path.splitext(filename)[1]
        if file_ext not in extensions[0]:
            # Not a supported annotated file: ignore it.
            continue

        trs_input = sppasTrsRW(filename).read()
        found = trs_input.find(self._options['tiername'], case_sensitive=False)
        if found is not None:
            return found

    # No file contains the expected tier.
    logging.error('A tier with name {:s} was not found.'.format(self._options['tiername']))
    raise NoTierInputError
run
Run the automatic annotation process on an input.
Parameters
- input_files: (list of str) Time-aligned tokens
- output: (str) the output file name
Returns
- (sppasTranscription) when output is None; otherwise a list containing the name of the written file
View Source
def run(self, input_files, output=None):
    """Run the automatic annotation process on an input.

    The result is a transcription named after this annotation, holding
    the tier created by make_stp_tier(). Its 'annotation_result_of'
    metadata is the first input file.

    :param input_files: (list of str) Time-aligned tokens
    :param output: (str) the output file name
    :raises: NoTierInputError (from get_inputs), EmptyOutputError
    :returns: (sppasTranscription) if output is None, otherwise a list
        with the name of the written file

    """
    stp_tier = self.make_stp_tier(self.get_inputs(input_files))

    trs_output = sppasTranscription(self.name)
    trs_output.set_meta('annotation_result_of', input_files[0])
    trs_output.append(stp_tier)

    # Nothing to write: give the result to the caller.
    if output is None:
        return trs_output

    output_file = self.fix_out_file_ext(output)
    if len(trs_output) == 0:
        raise EmptyOutputError

    sppasTrsRW(output_file).write(trs_output)
    return [output_file]
get_output_pattern
Pattern this annotation uses in an output filename.
View Source
def get_output_pattern(self):
    """Pattern this annotation uses in an output filename.

    :returns: (str) The 'outputpattern' option, or '-stops' if not set.

    """
    default_pattern = '-stops'
    return self._options.get('outputpattern', default_pattern)