# Source code for annotations.SelfRepet.detectrepet
"""
:filename: sppas.src.annotations.SelfRepet.detectrepet.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: Detect Self-Repetition of a speaker.
.. _This file is part of SPPAS: http://www.sppas.org/
..
-------------------------------------------------------------------------
        ___   __    __    __    ___
       /     |  \  |  \  |  \  /              the automatic
       \__   |__/  |__/  |___| \__             annotation and
          \  |     |     |   |    \             analysis
       ___/  |     |     |   | ___/              of speech
Copyright (C) 2011-2021 Brigitte Bigi
Laboratoire Parole et Langage, Aix-en-Provence, France
Use of this software is governed by the GNU Public License, version 3.
SPPAS is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SPPAS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SPPAS. If not, see <http://www.gnu.org/licenses/>.
This banner notice must not be removed.
-------------------------------------------------------------------------
"""
from .datastructs import DataRepetition
from .rules import SelfRules
# ----------------------------------------------------------------------------
class SelfRepetition(DataRepetition):
    """Self-Repetition automatic detection.

    Search for the sources, then find where are the echos.

    The source/echo storage (reset, get_source, set_source, add_echo) is
    not defined in this class; it is presumably inherited from
    DataRepetition -- confirm against datastructs.

    """

    def __init__(self, stop_list=None):
        """Create a new SelfRepetition instance.

        :param stop_list: (StopWords) List of un-relevant tokens.

        """
        super(SelfRepetition, self).__init__()
        # Rules deciding whether a candidate source is kept or rejected.
        self.__rules = SelfRules(stop_list)

    # -----------------------------------------------------------------------
    # Detect sources
    # -----------------------------------------------------------------------

    def detect(self, speaker, limit=10):
        """Search for the first self-repetition in tokens.

        The previous result is reset first. The search stops as soon as a
        source is set, or when the end of the data or 'limit' is reached.

        :param speaker: (DataSpeaker) All the data of speaker
        :param limit: (int) Go no longer than 'limit' entries in speaker data

        """
        self.reset()

        current_spk = 0
        next_spk = self.get_longest(current_spk, speaker)

        # Stop for searching if end of the data or self-repet found
        while (current_spk < len(speaker)
               and current_spk < limit
               and self.get_source() is None):

            if next_spk == -1:
                # The current token is not repeated: try the next one.
                current_spk += 1
            else:
                # A candidate source was found: keep it or skip over it.
                current_spk = self.select(current_spk, next_spk, speaker)

            next_spk = self.get_longest(current_spk, speaker)

    # -----------------------------------------------------------------------

    @staticmethod
    def get_longest(current, speaker):
        """Return the index of the last token of the longest repeated string.

        Tokens are examined one by one from 'current'; the scan stops at
        the first token for which no repetition is reported.

        :param current: (int) Current index in entries of speaker data
        :param speaker: (DataSpeaker) All the data of speaker
        :returns: (int) Index or -1

        """
        last_token = -1

        # Get the longest string
        for current_token in range(current, len(speaker)):
            next_word = speaker.get_next_word(current_token)
            repet_idx = speaker.is_word_repeated(current_token,
                                                 next_word,
                                                 speaker)
            if repet_idx == -1 or repet_idx < -1:
                # No repetition of this token: the repeated string ends here.
                break

            if repet_idx == current_token:
                # The reported repetition is the token itself: stop here.
                return current_token

            last_token = current_token

        return last_token

    # -----------------------------------------------------------------------

    def select(self, start, end, speaker):
        """Append (or not) a self-repetition.

        The candidate source is validated with the one-token rule when
        start and end are equal, and with the syntagme rule otherwise.
        When the rule accepts it, the source is set and its echos are
        searched.

        :param start: (int) start index of the entry of the source (speaker)
        :param end: (int) end index of the entry of the source (speaker)
        :param speaker: (DataSpeaker) Entries of speaker
        :returns: (int) Index of the entry following the candidate source

        """
        if end == start:
            keep_me = self.__rules.rule_one_token(start, speaker)
        else:
            keep_me = self.__rules.rule_syntagme(start, end, speaker)

        # Only a strict True validates the source.
        if keep_me is True:
            self.set_source(start, end)
            self.find_echos(start, end, speaker)

        return end + 1

    # -----------------------------------------------------------------------
    # Search for echos (for a given source)
    # -----------------------------------------------------------------------

    def find_echos(self, start, end, speaker):
        """Find all echos of a source.

        Echos are registered with add_echo(); nothing is returned.
        A token of the source without any repetition after the source is
        ignored instead of raising an exception.

        :param start: (int) start index of the entry of the source (speaker)
        :param end: (int) end index of the entry of the source (speaker)
        :param speaker: (DataSpeaker) All data of speaker

        """
        # Find all repeated tokens of each token of the source
        repeats = list()
        for i in range(start, end + 1):
            found = list()
            # The search of the echos starts right after the source.
            idx = speaker.is_word_repeated(i, end + 1, speaker)
            while idx != -1:
                found.append(idx)
                idx = speaker.is_word_repeated(i, idx + 1, speaker)
            repeats.append(found)

        # Source of one token: only its first repetition is an echo.
        if len(repeats) == 1:
            if len(repeats[0]) > 0:
                self.add_echo(repeats[0][0], repeats[0][0])
            return

        # Filter the repetitions (try to get the longest sequence)
        i = 0
        while i < len(repeats):
            repeated = SelfRepetition.__get_longest_repeated(i, repeats)
            if len(repeated) == 0:
                # This token of the source has no echo: skip it.
                i += 1
                continue
            self.add_echo(repeated[0], repeated[-1])
            # Jump over the tokens of the source covered by this echo.
            i += len(repeated)

    # -----------------------------------------------------------------------

    @staticmethod
    def __get_longest_repeated(start, repeats):
        """Select the longest echo from start position in repeats.

        One path is built from each repetition of the token at 'start'.
        A path is extended with the following tokens as long as one of
        them is repeated at the previous index, or at the previous index
        +1, +2 or -1 (tested in this order).

        :param start: (int) Index in repeats of the first token to consider
        :param repeats: (list of list of int) Repetition indexes of each
            token of the source
        :returns: (list of int) Sorted indexes of the first of the longest
            paths, or an empty list if repeats[start] is empty

        """
        path_repeats = list()
        for first_idx in repeats[start]:
            path = [first_idx]
            for j in range(start + 1, len(repeats)):
                prec_value = path[-1]
                # Offsets are ordered by priority: the first match wins.
                for offset in (0, 1, 2, -1):
                    if (prec_value + offset) in repeats[j]:
                        path.append(prec_value + offset)
                        break
                else:
                    # No near-contiguous repetition: this path ends here.
                    break
            path_repeats.append(path)

        if len(path_repeats) == 0:
            return list()

        # return the (first of the) longest path:
        return sorted(max(path_repeats, key=len))