# Source code for annotations.SelfRepet.sppasbaserepet

"""
:filename: sppas.src.annotations.SelfRepet.sppasbaserepet.py
:author:   Brigitte Bigi
:contact:  develop@sppas.org
:summary:  Base class for SPPAS integration of repetitions detection.

.. _This file is part of SPPAS: http://www.sppas.org/
..
    -------------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    -------------------------------------------------------------------------

"""

import os

from sppas.src.config import symbols
from sppas.src.config import IndexRangeException
from sppas.src.resources import sppasWordStrain
from sppas.src.anndata import sppasTier
from sppas.src.anndata import sppasLabel
from sppas.src.anndata import sppasTag
from sppas.src.anndata.aio.aioutils import serialize_labels

from ..baseannot import sppasBaseAnnotation
from ..annotationsexc import AnnotationOptionError
from ..StopWords.stpwds import StopWords

# ---------------------------------------------------------------------------

# Key of symbols.ortho whose value is "silence", found by a reverse lookup:
# the position of "silence" in the list of values is used to index the list
# of keys. list.index() raises ValueError when this module is imported if no
# entry has that value.
SIL_ORTHO = list(symbols.ortho.keys())[list(symbols.ortho.values()).index("silence")]

# ---------------------------------------------------------------------------


class sppasBaseRepet(sppasBaseAnnotation):
    """SPPAS Automatic Any-Repetition Detection.

    Base class for the SPPAS integration of repetitions detection. It stores
    the 'span' and 'stopwords' options, owns a stop-word list and a word
    replacement list, and builds result tiers from a tier of tokens.

    """

    def __init__(self, config, log=None):
        """Create a new sppasRepetition instance.

        Log is used for a better communication of the annotation process and
        its results. If None, logs are redirected to the default logging
        system.

        :param config: (str) Name of the JSON configuration file, without path.
        :param log: (sppasLog) Human-readable logs.

        """
        # List of options to configure this automatic annotation
        self._options = dict()
        self._options['span'] = 3
        self._options['stopwords'] = True

        # Option values can be overridden by those defined in the config file
        super(sppasBaseRepet, self).__init__(config, log)

        # Members
        self.max_span = 8    # upper bound accepted by set_span()
        self._word_strain = sppasWordStrain()
        self._stop_words = StopWords()

    # -----------------------------------------------------------------------

    def fix_options(self, options):
        """Fix all options.

        Keys "stopwords", "span" and "alpha" are forwarded to their setter.
        Any other key containing "pattern" is stored as-is in the options.

        :param options: list of sppasOption instances
        :raises: AnnotationOptionError if a key is not one of the above.

        """
        for opt in options:
            key = opt.get_key()

            if "stopwords" == key:
                self.set_use_stopwords(opt.get_value())

            elif "span" == key:
                self.set_span(opt.get_value())

            elif "alpha" == key:
                self.set_alpha(opt.get_value())

            elif "pattern" in key:
                self._options[key] = opt.get_value()

            else:
                raise AnnotationOptionError(key)

    # -----------------------------------------------------------------------

    def load_resources(self, lang_resources, lang=None):
        """Load a list of stop-words and replacements.

        Override the existing loaded lists. The extension of the given
        filename is ignored: '.stp' is appended to its root to get the
        stop-words file, and '.lem' to get the replacements file. Loading
        is best-effort: a failure is reported in the log, it is not raised.

        :param lang_resources: (str) File with extension '.stp' or '.lem' or nothing
        :param lang: (str) Not used by this method.

        """
        self._word_strain = sppasWordStrain()
        # Only the root of the filename matters: both extensions are re-built.
        fn = os.path.splitext(lang_resources)[0]

        try:
            stp = fn + '.stp'
            self._stop_words.load(stp, merge=False)
            self.logfile.print_message(
                "The initial list contains {:d} stop-words"
                "".format(len(self._stop_words)), indent=0)

        except Exception as e:
            # Do not keep a partially loaded list.
            self._stop_words.clear()
            self.logfile.print_message(
                "No stop-words loaded: {:s}".format(str(e)), indent=1)

        try:
            repl = fn + ".lem"
            # A missing replacement file is silently skipped.
            if os.path.exists(repl):
                self._word_strain.load(repl)
                self.logfile.print_message(
                    "The replacement list contains {:d} tokens"
                    "".format(len(self._word_strain)), indent=0)

        except Exception as e:
            # Do not keep a partially loaded list.
            self._word_strain = sppasWordStrain()
            self.logfile.print_message(
                "No replacement list loaded: {:s}"
                "".format(str(e)), indent=1)

    # -----------------------------------------------------------------------
    # Getters and Setters
    # -----------------------------------------------------------------------

    def set_use_stopwords(self, use_stopwords):
        """Fix the use_stopwords option.

        If use_stopwords is set to True, sppasRepetition() will add specific
        stopwords to the stopwords list (deducted from the input text).

        :param use_stopwords: (bool) The value is converted with bool().

        """
        self._options['stopwords'] = bool(use_stopwords)

    # -----------------------------------------------------------------------

    def set_span(self, span):
        """Fix the span option.

        Span is the maximum number of IPUs to search for repetitions.
        A value of 1 means to search only in the current IPU.

        :param span: (int) Value in range ]0; max_span]. It is converted
            with int().
        :raises: IndexRangeException if the value is out of range.

        """
        span = int(span)
        if 0 < span <= self.max_span:
            self._options['span'] = span
        else:
            raise IndexRangeException(span, 0, self.max_span)

    # -----------------------------------------------------------------------

    def set_alpha(self, alpha):
        """Fix the alpha option.

        Alpha is a coefficient to add specific stop-words in the list.
        The value is forwarded to the stop-words list.

        :param alpha: (float)

        """
        self._stop_words.set_alpha(alpha)

    # -----------------------------------------------------------------------
    # Make tiers for the result
    # -----------------------------------------------------------------------

    def make_word_strain(self, tier):
        """Return a tier with modified tokens.

        Each token having an entry in the replacement list is replaced by
        this entry; the other ones are kept unchanged. If the replacement
        list is empty, the given tier itself is returned.

        :param tier: (sppasTier) Time-aligned tokens.
        :returns: (sppasTier) The given tier, or a new tier 'TokenStrain'.

        """
        if len(self._word_strain) == 0:
            return tier

        self.logfile.print_message("Words strain enabled.", indent=1, status=2)
        lems_tier = sppasTier('TokenStrain')
        for ann in tier:
            token = serialize_labels(ann.get_labels())
            # The token itself is the fallback when no replacement is known.
            lem = self._word_strain.get(token, token)
            lems_tier.create_annotation(
                ann.get_location().copy(),
                sppasLabel(sppasTag(lem))
            )

        return lems_tier

    # -----------------------------------------------------------------------

    def make_stop_words(self, tier):
        """Return a tier indicating if entries are stop-words.

        Annotations whose serialized labels are in symbols.all are ignored:
        no annotation is created for them in the returned tier.

        :param tier: (sppasTier) Time-aligned tokens.
        :returns: (sppasTier) New tier 'StopWord' with tags of type "bool".

        """
        stp_tier = sppasTier('StopWord')
        for ann in tier:
            token = serialize_labels(ann.get_labels())
            if token not in symbols.all:
                stp = self._stop_words.is_in(token)
                stp_tier.create_annotation(
                    ann.get_location().copy(),
                    sppasLabel(sppasTag(stp, tag_type="bool"))
                )

        return stp_tier