# Source code for annotations.Syll.syllabify

# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.annotations.Syll.syllabify.py
:author:   Brigitte Bigi
:contact:  develop@sppas.org
:summary:  Syllabification of a sequence of phonemes.

.. _This file is part of SPPAS: http://www.sppas.org/
..
    -------------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    -------------------------------------------------------------------------

"""

from sppas.src.config import separators

from .rules import SyllRules

# ----------------------------------------------------------------------------


class Syllabifier(object):
    """Syllabification of a sequence of phonemes.

    Each phoneme is mapped to a class by the loaded rules, then syllable
    boundaries are fixed between two consecutive nuclei (classes "V" or
    "W"), first with the class-based rules and then with the
    phoneme-based ones. The break symbol of the rules always ends a
    syllable.

    """

    def __init__(self, rules_filename=None):
        """Create a new Syllabifier instance.

        Load rules from a text file, depending on the language and phonemes
        encoding. See documentation for details about this file.

        :param rules_filename: (str) Name of the file with the list of rules.

        """
        self.rules = SyllRules(rules_filename)

    # -----------------------------------------------------------------------

    def annotate(self, phonemes):
        """Return the syllable boundaries of a sequence of phonemes.

        >>> phonemes = ['a', 'p', 's', 'k', 'm', 'w', 'a']
        >>> Syllabifier("fra-config-file").annotate(phonemes)
        [(0, 3), (4, 6)]

        :param phonemes: (list)
        :returns: list of tuples (begin index, end index)

        """
        # Convert a list of phonemes into a list of classes.
        classes = [self.rules.get_class(p) for p in phonemes]
        syllables = list()

        # Find the first vowel = first nucleus
        nucleus = Syllabifier._fix_nucleus(classes, 0)
        if nucleus == -1:
            return list()

        end_syll = -1
        while nucleus != -1:
            start_syll = self._fix_start_syll(classes, end_syll, nucleus)
            next_nucleus = Syllabifier._find_next_vowel(classes, nucleus+1)
            next_break = Syllabifier._find_next_break(classes, nucleus)

            if next_break != -1 and \
                    (next_break < next_nucleus or next_nucleus == -1):
                # no rule to apply if the next event is a break.
                # ie next break occurs before next nucleus or
                # no next nucleus
                # (end_syll is left unchanged here: the backward search of
                # _fix_start_syll stops on a break symbol.)
                syllables.append((start_syll, next_break-1))

            elif next_break == -1 and next_nucleus == -1:
                # no rule to apply if current nucleus concerns
                # the last syllable
                end_syll = len(phonemes) - 1
                syllables.append((start_syll, end_syll))

            else:
                # apply the exception rule or the general one
                end_syll = self._apply_class_rules(classes,
                                                   nucleus,
                                                   next_nucleus)
                # apply the specific rules on phonemes to shift the end
                end_syll = self._apply_phon_rules(phonemes,
                                                  end_syll,
                                                  nucleus,
                                                  next_nucleus)
                syllables.append((start_syll, end_syll))

            nucleus = next_nucleus

        return syllables

    # -----------------------------------------------------------------------

    @staticmethod
    def phonetize_syllables(phonemes, syllables):
        """Return the phonetized sequence of syllables.

        >>> phonemes = ['a', 'p', 's', 'k', 'm', 'w', 'a']
        >>> syllables = Syllabifier("fra-config-file").annotate(phonemes)
        >>> Syllabifier.phonetize_syllables(phonemes, syllables)
        "a-p-s-k.m-w-a"

        :param phonemes: (list) List of phonemes
        :param syllables: list of tuples (begin index, end index)
        :returns: (str) String representing the syllables segmentation

        """
        # The end index of a syllable is inclusive, hence the "+1".
        return separators.syllables.join(
            separators.phonemes.join(phonemes[begin:end+1])
            for (begin, end) in syllables
        )

    # -----------------------------------------------------------------------

    def classes_phonetized(self, phonetized_syllable):
        """Return the classes of a phonetized syllable.

        >>> syllable = "a-p-s-k"
        >>> syllabifier.classes_phonetized(syllable)
        "V-P-F-P"

        :param phonetized_syllable: (str) Phonemes of a syllable, joined
            with the phonemes separator
        :returns: (str) Classes of the phonemes, joined with the same
            separator

        """
        return separators.phonemes.join(
            self.rules.get_class(p)
            for p in phonetized_syllable.split(separators.phonemes)
        )

    # -----------------------------------------------------------------------
    # Private
    # -----------------------------------------------------------------------

    @staticmethod
    def _fix_nucleus(classes, from_index):
        """Search for the next nucleus of a syllable.

        :param classes: (list) List of phoneme classes
        :param from_index: (int) the position where the search will begin
        :returns: the position of the nucleus or -1 if there is no vowel

        """
        next_nucleus = -1
        next_break = -1
        while next_break <= next_nucleus:
            next_nucleus = Syllabifier._find_next_vowel(classes, from_index)
            next_break = Syllabifier._find_next_break(classes, from_index)
            if next_nucleus == -1:
                return -1
            if next_break == -1:
                return next_nucleus
            # search again, starting at the vowel, to ignore the breaks
            # which are before it
            from_index = next_nucleus

        return next_nucleus

    # -----------------------------------------------------------------------

    @staticmethod
    def _fix_start_syll(classes, end_previous, nucleus):
        """Search for the index of the first phoneme of the syllable.

        :param classes: (list) List of phoneme classes
        :param end_previous: (int) index of the end of the previous
            syllable, or -1 if there is none
        :param nucleus: (int) index of the nucleus of the syllable
        :returns: (int) index of the first phoneme of the syllable

        """
        # should not occur
        if end_previous == nucleus:
            return nucleus

        # look backward, from the phoneme before the nucleus
        for i in reversed(range(end_previous, nucleus)):
            # the beginning of the sequence is reached (end_previous is -1)
            if i == -1:
                return 0
            if classes[i] in ("V", "W", SyllRules.BREAK_SYMBOL):
                return i+1

        # no break nor vowel between the end of the previous syllable
        # and the current nucleus
        return end_previous+1

    # -----------------------------------------------------------------------

    @staticmethod
    def _find_next_vowel(classes, from_index):
        """Find the index of the next vowel.

        -1 is returned if no longer vowel is existing.

        :param classes: (list) List of phoneme classes
        :param from_index: (int) the position where the search will begin
            (this from index is included in).
        :returns: the position of the next vowel or -1

        """
        for i in range(from_index, len(classes)):
            if classes[i] in ("V", "W"):
                return i
        return -1

    # -----------------------------------------------------------------------

    @staticmethod
    def _find_next_break(classes, from_index):
        """Find the index of the next break.

        -1 is returned if no longer break is existing.

        :param classes: (list) List of phoneme classes
        :param from_index: (int) the position where the search will begin
        :returns: the position of the next break or -1

        """
        for i in range(from_index, len(classes)):
            if classes[i] == SyllRules.BREAK_SYMBOL:
                return i
        return -1

    # -----------------------------------------------------------------------

    def _apply_class_rules(self, classes, v1, v2):
        """Apply the syllabification rules between v1 and v2.

        :param classes: (list) List of phoneme classes
        :param v1: (int) index of the current nucleus
        :param v2: (int) index of the next nucleus
        :returns: (int) v1 shifted by the boundary the rules return for
            the sequence of classes from v1 to v2 (both included)

        """
        sequence = "".join(classes[v1:v2+1])
        return v1 + self.rules.get_class_rules_boundary(sequence)

    # -----------------------------------------------------------------------

    def _apply_phon_rules(self, phonemes, end_syll, v1, v2):
        """Apply the specific phoneme-based syllabification rules.

        Applied between v1 and v2.

        :param phonemes: (list) List of phonemes
        :param end_syll: (int) index of the end of the syllable, as fixed
            by the class-based rules
        :param v1: (int) index of the current nucleus
        :param v2: (int) index of the next nucleus
        :returns: (int) end_syll, shifted by the gap of the rules only if
            the result remains between v1 and v2 (both included)

        """
        _str = ""
        nb = v2-v1
        if nb > 1:
            # specific rules are sequences of 5 consonants max:
            # shorter sequences are left-padded with "ANY", longer ones
            # have no prefix at all.
            if nb <= 5:
                _str = "ANY "*(5-nb) + "V "
            # phonemes strictly between the two nuclei
            _str += " ".join(phonemes[v1+i] for i in range(1, nb))

        _str = _str.strip()
        if len(_str) > 0:
            d = self.rules.get_gap(_str)
            if d != 0:
                # check validity before assigning...
                new_end = end_syll + d
                if v2 >= new_end >= v1:
                    end_syll = new_end

        return end_syll