Source code for annotations.SelfRepet.datastructs

"""
:filename: sppas.src.annotations.SelfRepet.datastructs.py
:author:   Brigitte Bigi
:contact:  develop@sppas.org
:summary: Data structure to store a source and its echos.

.. _This file is part of SPPAS: http://www.sppas.org/
..
    -------------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    -------------------------------------------------------------------------

"""

import re

from sppas.src.config import symbols
from sppas.src.config import RangeBoundsException
from sppas.src.config import IndexRangeException
from sppas.src.config import sppasUnicode

# ---------------------------------------------------------------------------


[docs]class DataRepetition(object): """Class to store one repetition (the source and the echos). The source of a repetition is represented as a tuple (start, end). The echos of this latter are stored as a list of tuples (start, end). """
[docs] def __init__(self, s1=None, s2=None, r1=None, r2=None): """Create a DataRepetition data structure. :param s1: start position of the source. :param s2: end position of the source. :param r1: start position of an echo :param r2: end position of an echo """ self.__source = None self.set_source(s1, s2) self.__echos = list() if r1 is not None and r2 is not None: self.add_echo(r1, r2)
# -----------------------------------------------------------------------
[docs] def reset(self): """Fix the source to None and the echos to an empty list.""" self.__source = None self.__echos = list()
# -----------------------------------------------------------------------
[docs] def set_source(self, start, end): """Set the position of the source. Setting the position of the source automatically resets the echos because it's not correct to change the source of existing echos. :param start: Start position of the source :param end: End position of the source :raises: ValueError, IndexError """ if start is None or end is None: self.reset() return s1 = int(start) s2 = int(end) if s1 > s2: raise RangeBoundsException(s1, s2) if s1 < 0 or s2 < 0: raise ValueError self.__source = (s1, s2) self.__echos = list()
# -----------------------------------------------------------------------
[docs] def get_source(self): """Return the tuple (start, end) of the source.""" return self.__source
# -----------------------------------------------------------------------
[docs] def get_echos(self): """Return the list of echos.""" return self.__echos
# -----------------------------------------------------------------------
[docs] def add_echo(self, start, end): """Add an entry in the list of echos. :param start: Start position of the echo. :param end: End position of the source. :raises: ValueError """ if self.__source is None: raise Exception('No source defined.') if start is None or end is None: return 0 r1 = int(start) r2 = int(end) if r1 > r2: raise RangeBoundsException(r1, r2) if r1 < 0 or r2 < 0: raise ValueError if (r1, r2) not in self.__echos: self.__echos.append((r1, r2)) return 1 return 0
# ----------------------------------------------------------------------- # Overloads # ----------------------------------------------------------------------- def __str__(self): print("source: ({:d}, {:d})" "".format(self.__source[0], self.__source[1])) print("echos: ") for rep in self.__echos: print(" ({:d}, {:d}) ".format(rep[0], rep[1]))
# ---------------------------------------------------------------------------
[docs]class Entry(object): """Class to store a formatted unicode entry. """
[docs] def __init__(self, entry): """Create an Entry instance. :param entry: (str, unicode) """ self.__entry = None self.set(entry)
# -----------------------------------------------------------------------
[docs] def get(self): """Return the formatted unicode entry.""" return self.__entry
# -----------------------------------------------------------------------
[docs] def set(self, entry): """Fix the entry. :param entry: (str, unicode) entry to store. """ if entry is None: self.__entry = sppasUnicode("").to_strip() else: self.__entry = sppasUnicode(entry).to_strip() self.__clean()
# ----------------------------------------------------------------------- # Private # ----------------------------------------------------------------------- def __clean(self): """Remove some punctuations (they can be due to the EOT).""" self.__entry = re.sub("\~$", "", self.__entry) self.__entry = re.sub("\-+$", "", self.__entry) self.__entry = re.sub(">$", "", self.__entry) self.__entry = re.sub("^<", "", self.__entry)
# ---------------------------------------------------------------------------
[docs]class DataSpeaker(object): """Class to store data of a speaker. Stored data are a list of formatted unicode strings. """
[docs] def __init__(self, tokens): """Create a DataSpeaker instance. :param tokens: (list) List of tokens. """ self.__entries = list() for tok in tokens: self.__entries.append(Entry(tok).get())
# -----------------------------------------------------------------------
[docs] def is_word(self, idx): """Return true if the entry at the given index is a word. An empty entry is not a word. Symbols (silences, laughs...) are not words. Hesitations are considered words. Return False if the given index is wrong. :param idx: (int) Index of the entry to get :returns: (bool) """ if idx < 0: return False if idx >= len(self.__entries): return False # An empty string if len(self.__entries[idx]) == 0: return False # Symbols used by SPPAS to represent an event if self.__entries[idx] in symbols.all: return False return True
# -----------------------------------------------------------------------
[docs] def get_next_word(self, current): """Ask for the index of the next word in entries. :param current (int) Current position to search for the next word :returns: (int) Index of the next word or -1 if no next word can be found. """ # check if current is a correct value self.__get_entry(current) # search for the next word after the current index c_next = current + 1 while c_next < len(self.__entries): if self.is_word(c_next) is True: return c_next c_next += 1 return -1
# -----------------------------------------------------------------------
[docs] def is_word_repeated(self, current, other_current, other_speaker): """Ask for a token to be a repeated word. :param current: (int) From index, in current speaker :param other_current: (int) From index, in the other speaker :param other_speaker: (DataSpeaker) Data of the other speaker :returns: index of the echo or -1 """ # Does the current entry is a word? if self.is_word(current) is False: return -1 # Search for this word in the other speaker data word = self.__entries[current] while 0 <= other_current < len(other_speaker): other_token = other_speaker[other_current] if word == other_token: return other_current # not found. try next one other_current = other_speaker.get_next_word(other_current) return -1
# ----------------------------------------------------------------------- # Private # ----------------------------------------------------------------------- def __get_entry(self, idx): """Return the formatted "token" at the given index. Raise exception if index is wrong. :param idx: (int) Index of the entry to get :returns: (str) unicode formatted entry """ if idx < 0: raise IndexRangeException(idx, 0, len(self.__entries)) if idx >= len(self.__entries): raise IndexRangeException(idx, 0, len(self.__entries)) return self.__entries[idx] # ----------------------------------------------------------------------- # Overloads # ----------------------------------------------------------------------- def __str__(self): return " ".join([e for e in self.__entries]) def __iter__(self): for a in self.__entries: yield a def __getitem__(self, i): return self.__get_entry(i) def __len__(self): return len(self.__entries)