Source code for annotations.SelfRepet.sppasbaserepet
"""
:filename: sppas.src.annotations.SelfRepet.sppasbaserepet.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: Base class for SPPAS integration of repetitions detection.
.. _This file is part of SPPAS: http://www.sppas.org/
..
-------------------------------------------------------------------------
___ __ __ __ ___
/ | \ | \ | \ / the automatic
\__ |__/ |__/ |___| \__ annotation and
\ | | | | \ analysis
___/ | | | | ___/ of speech
Copyright (C) 2011-2021 Brigitte Bigi
Laboratoire Parole et Langage, Aix-en-Provence, France
Use of this software is governed by the GNU Public License, version 3.
SPPAS is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SPPAS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SPPAS. If not, see <http://www.gnu.org/licenses/>.
This banner notice must not be removed.
-------------------------------------------------------------------------
"""
import os
from sppas.src.config import symbols
from sppas.src.config import IndexRangeException
from sppas.src.resources import sppasWordStrain
from sppas.src.anndata import sppasTier
from sppas.src.anndata import sppasLabel
from sppas.src.anndata import sppasTag
from sppas.src.anndata.aio.aioutils import serialize_labels
from ..baseannot import sppasBaseAnnotation
from ..annotationsexc import AnnotationOptionError
from ..StopWords.stpwds import StopWords
# ---------------------------------------------------------------------------
SIL_ORTHO = list(symbols.ortho.keys())[list(symbols.ortho.values()).index("silence")]
# ---------------------------------------------------------------------------
[docs]class sppasBaseRepet(sppasBaseAnnotation):
"""SPPAS Automatic Any-Repetition Detection.
"""
[docs] def __init__(self, config, log=None):
"""Create a new sppasRepetition instance.
Log is used for a better communication of the annotation process and its
results. If None, logs are redirected to the default logging system.
:param config: (str) Name of the JSON configuration file, without path.
:param log: (sppasLog) Human-readable logs.
"""
# List of options to configure this automatic annotation
self._options = dict()
self._options['span'] = 3
self._options['stopwords'] = True
# Option values can be overridden by those defined in the config file
super(sppasBaseRepet, self).__init__(config, log)
# Members
self.max_span = 8
self._word_strain = sppasWordStrain()
self._stop_words = StopWords()
# -----------------------------------------------------------------------
[docs] def fix_options(self, options):
"""Fix all options.
:param options: list of sppasOption instances
"""
for opt in options:
key = opt.get_key()
if "stopwords" == key:
self.set_use_stopwords(opt.get_value())
elif "span" == key:
self.set_span(opt.get_value())
elif "alpha" == key:
self.set_alpha(opt.get_value())
elif "pattern" in key:
self._options[key] = opt.get_value()
else:
raise AnnotationOptionError(key)
# -----------------------------------------------------------------------
[docs] def load_resources(self, lang_resources, lang=None):
"""Load a list of stop-words and replacements.
Override the existing loaded lists...
:param lang_resources: (str) File with extension '.stp' or '.lem' or nothing
:param lang: (str)
"""
self._word_strain = sppasWordStrain()
fn, fe = os.path.splitext(lang_resources)
try:
stp = fn + '.stp'
self._stop_words.load(stp, merge=False)
self.logfile.print_message(
"The initial list contains {:d} stop-words"
"".format(len(self._stop_words)), indent=0)
except Exception as e:
self._stop_words.clear()
self.logfile.print_message(
"No stop-words loaded: {:s}".format(str(e)), indent=1)
try:
repl = fn + ".lem"
if os.path.exists(repl):
self._word_strain.load(repl)
self.logfile.print_message(
"The replacement list contains {:d} tokens"
"".format(len(self._word_strain)), indent=0)
except Exception as e:
self._word_strain = sppasWordStrain()
self.logfile.print_message(
"No replacement list loaded: {:s}"
"".format(str(e)), indent=1)
# -----------------------------------------------------------------------
# Getters and Setters
# -----------------------------------------------------------------------
[docs] def set_use_stopwords(self, use_stopwords):
"""Fix the use_stopwords option.
If use_stopwords is set to True, sppasRepetition() will add specific
stopwords to the stopwords list (deducted from the input text).
:param use_stopwords: (bool)
"""
self._options['stopwords'] = bool(use_stopwords)
# -----------------------------------------------------------------------
[docs] def set_span(self, span):
"""Fix the span option.
Span is the maximum number of IPUs to search for repetitions.
A value of 1 means to search only in the current IPU.
:param span: (int)
"""
span = int(span)
if 0 < span <= self.max_span:
self._options['span'] = span
else:
raise IndexRangeException(span, 0, self.max_span)
# -----------------------------------------------------------------------
[docs] def set_alpha(self, alpha):
"""Fix the alpha option.
Alpha is a coefficient to add specific stop-words in the list.
:param alpha: (float)
"""
self._stop_words.set_alpha(alpha)
# -----------------------------------------------------------------------
# Make tiers for the result
# -----------------------------------------------------------------------
[docs] def make_word_strain(self, tier):
"""Return a tier with modified tokens.
:param tier: (sppasTier) Time-aligned tokens.
"""
if len(self._word_strain) == 0:
return tier
self.logfile.print_message("Words strain enabled.", indent=1, status=2)
lems_tier = sppasTier('TokenStrain')
for ann in tier:
token = serialize_labels(ann.get_labels())
lem = self._word_strain.get(token, token)
lems_tier.create_annotation(
ann.get_location().copy(),
sppasLabel(sppasTag(lem))
)
return lems_tier
# -----------------------------------------------------------------------
[docs] def make_stop_words(self, tier):
"""Return a tier indicating if entries are stop-words.
:param tier: (sppasTier) Time-aligned tokens.
"""
stp_tier = sppasTier('StopWord')
for ann in tier:
token = serialize_labels(ann.get_labels())
if token not in symbols.all:
stp = self._stop_words.is_in(token)
stp_tier.create_annotation(
ann.get_location().copy(),
sppasLabel(sppasTag(stp, tag_type="bool"))
)
return stp_tier