# Source code for annotations.SelfRepet.rules
"""
:filename: sppas.src.annotations.SelfRepet.rules.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: Self-Repetition rules to accept/reject a candidate.
.. _This file is part of SPPAS: http://www.sppas.org/
..
-------------------------------------------------------------------------
___ __ __ __ ___
/ | \ | \ | \ / the automatic
\__ |__/ |__/ |___| \__ annotation and
\ | | | | \ analysis
___/ | | | | ___/ of speech
Copyright (C) 2011-2021 Brigitte Bigi
Laboratoire Parole et Langage, Aix-en-Provence, France
Use of this software is governed by the GNU Public License, version 3.
SPPAS is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SPPAS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SPPAS. If not, see <http://www.gnu.org/licenses/>.
This banner notice must not be removed.
-------------------------------------------------------------------------
"""
from sppas.src.resources.vocab import sppasVocabulary
# ----------------------------------------------------------------------------
class SelfRules(object):
    """Rules to select self-repetitions.

    Proposed rules deal with the number of words, the word frequencies and
    distinguish whether the repetition is strict or not. The following rules
    are proposed for other-repetitions:

        - Rule 1: A source is accepted if it contains one or more relevant
          token. Relevance depends on the speaker producing the echo;
        - Rule 2: A source which contains at least K tokens is accepted
          if the repetition is strict.

    Rule number 1 requires a clear definition of the relevance of a token.
    Un-relevant tokens are stored in a stop-list. The stop-list should also
    contain the very frequent tokens of the given language, like adjectives,
    pronouns, etc.

    Only the relevance test, the single-token rule and rule 1 are
    implemented by the methods of this class.

    """

    def __init__(self, stop_list=None):
        """Create a SelfRules instance.

        :param stop_list: (sppasVocabulary or list) Un-relevant tokens.
            A sppasVocabulary is stored as-is (it is not copied); the tokens
            of any other iterable are added one by one to a new vocabulary.
            With None, the stop-list is an empty vocabulary.

        """
        if isinstance(stop_list, sppasVocabulary):
            # Share the given vocabulary instead of duplicating its content.
            self.__stoplist = stop_list
        else:
            self.__stoplist = sppasVocabulary()
            if stop_list is not None:
                for token in stop_list:
                    self.__stoplist.add(token)

    # -----------------------------------------------------------------------

    def is_relevant(self, idx, speaker):
        """Ask for the entry of a speaker to be relevant or not.

        An entry is considered relevant if:

            1. ``speaker.is_word(idx)`` is true. According to the original
               documentation, it means the entry is not a silence, a pause,
               a laugh, a dummy or a noise;
            2. It is not in the stop-list, i.e. the stop-list reports the
               entry as unknown.

        :param idx: (int) Index of the entry to be checked
        :param speaker: (DataSpeaker) All the data
        :returns: (bool)

        """
        word = speaker.is_word(idx)
        # The stop-list is queried even if the entry is not a word, like the
        # historical implementation did.
        not_stop_word = self.__stoplist.is_unk(speaker[idx])
        # 'and' returns one of its operands: force a real boolean so that
        # the documented return type holds whatever the helpers return.
        return bool(word and not_stop_word)

    # -----------------------------------------------------------------------

    def count_relevant_tokens(self, start, end, speaker):
        """Count the number of relevant words from start to end (included).

        :param start: (int) Index to start to count
        :param end: (int) Index to stop to count
        :param speaker: (DataSpeaker) All the data
        :returns: (int) 0 if end is lesser than start

        """
        return sum(1 for i in range(start, end + 1)
                   if self.is_relevant(i, speaker))

    # -----------------------------------------------------------------------

    def rule_one_token(self, current, speaker):
        """Check whether one token is a self-repetition or not.

        Rules are:

            - the token must be a word, and not in the stop-list;
            - the token must be repeated.

        :param current: (int) Index of the token to check
        :param speaker: (DataSpeaker) All the data
        :returns: (bool)

        """
        # Is it a relevant token? Truthiness is tested, not identity with
        # False, so that any falsy answer rejects the token.
        if not self.is_relevant(current, speaker):
            return False

        # Is it a repeated word? The repetition is searched in the data of
        # the same speaker, from the word following the current one.
        next_word = speaker.get_next_word(current)
        echo = speaker.is_word_repeated(current, next_word, speaker)
        # -1 is the value used here to mean "no repetition found".
        return echo != -1

    # -----------------------------------------------------------------------

    def rule_syntagme(self, start, end, speaker):
        """Apply rule 1 to decide if selection is a repetition or not.

        Rule 1: The selection of tokens of speaker 1 must contain at least
        one relevant token for speaker 2.

        :param start: (int) Index to start the selection
        :param end: (int) Index to stop the selection
        :param speaker: (DataSpeaker) All the data
        :returns: (bool) True if at least one token of the selection,
            from start to end (included), is relevant

        """
        return self.count_relevant_tokens(start, end, speaker) > 0