Source code for annotations.Phon.phonunk

# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.annotations.Phon.phonunk.py
:author:   Brigitte Bigi
:contact:  develop@sppas.org
:summary:  Phonetization of an unknown entry.

.. _This file is part of SPPAS: http://www.sppas.org/
..
    -------------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    -------------------------------------------------------------------------

"""

import re

from sppas.src.config import sppasUnicode

from .dagphon import sppasDAGPhonetizer

# ---------------------------------------------------------------------------

# Maximum number of characters of an unknown entry: sppasPhonUnk.get_phon()
# raises an exception when the given entry is longer than this limit.
LIMIT_SIZE = 40

# ---------------------------------------------------------------------------


class sppasPhonUnk(object):
    """Perform a dictionary-based phonetization for unknown entries.

    Implements a language-independent algorithm to phonetize unknown tokens.
    The algorithm is based on the idea that given enough examples it should
    be possible to predict the pronunciation of unseen tokens purely by
    analogy. It consists in exploring the unknown token from left to right,
    then from right to left, and to find the longest strings in the
    dictionary.

    Since this algorithm uses the dictionary, the quality of such a
    phonetization strongly depends on this resource.

    Example of use:

        >>> d = { 'a':'a|aa', 'b':'b', 'c':'c|cc', 'abb':'abb', 'bac':'bac' }
        >>> p = sppasPhonUnk(d)

    """

    def __init__(self, pron_dict):
        """Create a sppasPhonUnk instance.

        :param pron_dict: (sppasPronDict) Dictionary of a set of tuples:
            token=key, phon=value. It only has to support the ``in``
            operator and a ``get(key, default)`` method.

        """
        self.prondict = pron_dict
        self.dagphon = sppasDAGPhonetizer(variants=4)

    # ------------------------------------------------------------------
    # Getters and Setters
    # ------------------------------------------------------------------

    def set_variants(self, v):
        """Fix the maximum number of variants.

        :param v: (int) If v is set to 0, all variants will be returned.

        """
        self.dagphon.set_variants(v)

    # -----------------------------------------------------------------------

    def get_phon(self, entry):
        """Return the phonetization of an unknown entry.

        The entry is normalized, split into segments, and each segment is
        phonetized twice with a longest-matching strategy: from left to
        right and from right to left. Both results are then given to the
        DAG phonetizer which builds the pronunciation variants.

        :param entry: (str) the string to phonetize
        :returns: a string with the proposed phonetization, or an empty
            string if nothing remains of the entry after normalization
        :raises: Exception if the entry is longer than LIMIT_SIZE or if
            the word can NOT be phonetized

        """
        # Normalize with sppasUnicode: strip then lowercase.
        _str = sppasUnicode(entry).to_strip()
        _str = sppasUnicode(_str).to_lower()

        # Remove at most one non-alphanumeric character at each end.
        if len(_str) > 0 and not _str[-1].isalnum():
            _str = _str[:-1]
        if len(_str) > 0 and not _str[0].isalnum():
            _str = _str[1:]
        if len(_str) == 0:
            return ""

        # The size limit is checked on the entry as given by the caller.
        if len(entry) > LIMIT_SIZE:
            raise Exception(
                "The entry can't be phonetized: it has more than {:d} "
                "characters.".format(LIMIT_SIZE))

        # Find all pronunciations of segments with a longest matching algo.
        # Segments are separated by a hyphen, an apostrophe, an underscore
        # or any whitespace.
        lr_segments = list()
        rl_segments = list()
        for s in re.split(r"[-'_\s]", _str):
            plr = self.__recurslr(s).strip()
            if len(plr) > 0:
                lr_segments.append(plr)
            prl = self.__recursrl(s).strip()
            if len(prl) > 0:
                rl_segments.append(prl)
        pronlr = " ".join(lr_segments)
        pronrl = " ".join(rl_segments)

        # Create the output
        pron = ""
        if len(pronlr) > 0:
            if len(pronrl) > 0:
                pron = self.dagphon.decompose(pronlr, pronrl)
            else:
                pron = self.dagphon.decompose(pronlr)
        elif len(pronrl) > 0:
            pron = self.dagphon.decompose(pronrl)

        if len(pron) > 0:
            return pron

        raise Exception("The entry can't be phonetized: no part of it "
                        "was found in the pronunciation dictionary.")

    # -----------------------------------------------------------------------
    # Private
    # -----------------------------------------------------------------------

    @staticmethod
    def __join_prons(first, second):
        """Join two phonetizations with a whitespace, ignoring empty ones.

        :param first: (str) phonetization of the leading part
        :param second: (str) phonetization of the trailing part
        :returns: (str) both parts separated by a whitespace, or the only
            non-empty one, or an empty string

        """
        if len(first) > 0 and len(second) > 0:
            return first + " " + second
        # Keep whatever was phonetized: a part must not be lost because
        # the other one is unknown.
        if len(first) > 0:
            return first
        return second

    # -----------------------------------------------------------------------

    def __longestlr(self, entry):
        """Select the longest phonetization of an entry, from the end.

        :param entry: (str)
        :returns: (int) length of the longest prefix of the entry that is
            in the dictionary, or 0 if there is none

        """
        i = len(entry)
        while i > 0:
            # Find in the dictionary a substring from 0 to i
            if entry[:i] in self.prondict:
                # Return index for the longest string
                return i
            i -= 1

        # Did not find any pronunciation for this entry!
        return 0

    # -----------------------------------------------------------------------

    def __recurslr(self, entry):
        """Recursive method to find a phonetization of a supposed unk entry.

        The longest known prefix is phonetized, then the remaining right
        part is phonetized the same way. If no prefix is known, the first
        character is skipped.

        :param entry: (str)
        :returns: a string with the proposed phonetization.
            Whitespace separate segments.

        """
        if len(entry) == 0:
            return ""

        # LEFT:
        # ###########
        # Find the index of the longest left string that can be phonetized
        left_index = self.__longestlr(entry)
        if left_index == 0:
            # Nothing can be phonetized at the left part:
            # ignore the first character and continue with the others.
            _phonleft = ""
            left_index = 1
        else:
            # left is from the first to the leftindex character in str
            _phonleft = self.prondict.get(entry[:left_index], "")
            # The entire entry can be phonetized (nothing to do at right)
            if left_index == len(entry):
                return _phonleft

        # RIGHT:
        # ###########
        right = entry[left_index:]
        if len(right) == 0:
            return _phonleft
        if right in self.prondict:
            _phonright = self.prondict.get(right)
        else:
            # If right part of the entry is unknown...
            # Use recursivity to phonetize
            _phonright = self.__recurslr(right)

        return self.__join_prons(_phonleft, _phonright)

    # -----------------------------------------------------------------------

    def __longestrl(self, entry):
        """Select the longest phonetization of an entry, from the start.

        :param entry: (str)
        :returns: (int) start index of the longest suffix of the entry
            that is in the dictionary, or the entry length if there is none

        """
        i = 0
        while i < len(entry):
            # Find in the dictionary a substring from i to the entry-length
            if entry[i:] in self.prondict:
                # Return index for the longest string
                return i
            i += 1

        # Did not find any pronunciation for this entry!
        return len(entry)

    # -----------------------------------------------------------------------

    def __recursrl(self, entry):
        """Recursive method to find a phonetization of a supposed unk entry.

        The longest known suffix is phonetized, then the remaining left
        part is phonetized the same way. If no suffix is known, the last
        character is skipped.

        :param entry: (str)
        :returns: a string with the proposed phonetization.
            Whitespace separate segments.

        """
        if len(entry) == 0:
            return ""

        # RIGHT:
        # ###########
        # Find the index of the longest right string that can be phonetized
        right_index = self.__longestrl(entry)
        if right_index == len(entry):
            # Nothing can be phonetized at the right part:
            # ignore the last character and continue with the others.
            _phonright = ""
            right_index = len(entry) - 1
        else:
            # right is from the end to the rightindex character in str
            _phonright = self.prondict.get(entry[right_index:], "")
            # The entire entry can be phonetized (nothing to do at left)
            if right_index == 0:
                return _phonright

        # LEFT:
        # ###########
        left = entry[:right_index]
        if len(left) == 0:
            return _phonright
        if left in self.prondict:
            _phonleft = self.prondict.get(left)
        else:
            # If left part of the entry is unknown...
            # Use recursivity to phonetize
            _phonleft = self.__recursrl(left)

        return self.__join_prons(_phonleft, _phonright)