# Source code for annotations.Phon.dagphon
# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.annotations.Phon.dagphon.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: Directed Acyclic Graph for the phonetization of unknown entries.
.. _This file is part of SPPAS: http://www.sppas.org/
..
-------------------------------------------------------------------------
___ __ __ __ ___
/ | \ | \ | \ / the automatic
\__ |__/ |__/ |___| \__ annotation and
\ | | | | \ analysis
___/ | | | | ___/ of speech
Copyright (C) 2011-2021 Brigitte Bigi
Laboratoire Parole et Langage, Aix-en-Provence, France
Use of this software is governed by the GNU Public License, version 3.
SPPAS is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SPPAS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SPPAS. If not, see <http://www.gnu.org/licenses/>.
This banner notice must not be removed.
-------------------------------------------------------------------------
"""
import re
from sppas.src.config import separators
from sppas.src.structs.dag import DAG
# ----------------------------------------------------------------------------
class sppasDAGPhonetizer(object):
    """Phonetize unknown entries with a Directed Acyclic Graph (DAG).

    A phonetization is a string of whitespace-separated segments, each
    segment holding one or several alternative pronunciations separated by
    ``separators.variants``. It is converted into a DAG in which every path
    from the start node to the end node is one pronunciation of the whole
    entry.

    """

    def __init__(self, variants=4):
        """Create a sppasDAGPhonetizer instance.

        :param variants: (int) Maximum number of variants for phonetizations.
        :raises: ValueError if variants is out of the range [0, 20]

        """
        self.variants = 0
        self.set_variants(variants)

    # -----------------------------------------------------------------------

    def set_variants(self, v):
        """Fix the maximum number of variants.

        :param v: (int) If v is set to 0, all variants will be returned.
        :raises: ValueError if v is lower than 0 or greater than 20

        """
        if v < 0 or v > 20:
            raise ValueError('Unexpected value for the number of variants.')
        self.variants = v

    # -----------------------------------------------------------------------

    def phon2DAG(self, pron):
        """Convert a phonetization into a DAG.

        Node 0 is a start node and the last node is an end node. Each
        variant of each segment gets its own node, linked from every node
        of the preceding segment.

        :param pron: (str) Whitespace-separated segments; the variants of a
            segment are separated by ``separators.variants``.
        :returns: tuple (graph, prongraph) where prongraph[i] is the
            pronunciation stored for node i ("start" for the first node
            and "end" for the last one).

        """
        graph = DAG()              # the Graph: store segments and get all paths
        prongraph = ["start"]      # the pronunciation of each node

        # A Start node (required if the 1st segment has variants)
        graph.add_node(0)

        first = 1                  # index of the first node of the segment
        previous = [0]             # nodes of the preceding segment

        for segment in pron.split():
            alternatives = segment.split(separators.variants)
            for offset, alternative in enumerate(alternatives):
                node = first + offset
                prongraph.append(alternative)
                graph.add_node(node)
                # link this variant to all variants of the preceding segment
                for k in previous:
                    graph.add_edge(k, node)
            previous = list(range(first, first + len(alternatives)))
            first += len(alternatives)

        # add an "End" node, reachable from every node of the last segment
        prongraph.append("end")
        graph.add_node(first)
        for k in previous:
            graph.add_edge(k, first)

        return graph, prongraph

    # -----------------------------------------------------------------------

    def DAG2phon(self, graph, pron_graph):
        """Convert a DAG into a dict, including all pronunciation variants.

        :param graph: DAG as built by phon2DAG(): node 0 is the start node
            and node len(graph)-1 is the end node.
        :param pron_graph: (list) Pronunciation of each node of the graph.
        :returns: (dict) key is a pronunciation, with its items joined by
            ``separators.phonemes``; value is the number of items obtained
            when splitting that pronunciation on ``separators.phonemes``.

        """
        sep = separators.phonemes
        pron = dict()
        for path in graph.find_all_paths(0, len(graph) - 1):
            # ignore Start and End nodes. Joining (instead of concatenating
            # then stripping the 1st character) is also right if the
            # separator is longer than one character.
            p = sep.join(pron_graph[i] for i in path[1:-1])
            pron[p] = len(p.split(sep))
        return pron

    # -----------------------------------------------------------------------

    def decompose(self, pron1, pron2=""):
        """Create a decomposed phonetization from a string.

        Each non-blank input is converted into a DAG and all its paths are
        collected; the results of both inputs are then merged. For example,
        assuming "|" separates variants and "-" separates phonemes, the
        input "p1 p2|x2 p3|x3" has the four paths:
        p1-p2-p3, p1-p2-x3, p1-x2-p3 and p1-x2-x3.

        If the maximum number of variants is 0, all of them are returned.
        Otherwise only the shortest ones are kept, up to that maximum.

        :param pron1: (str) A phonetization
        :param pron2: (str) Optional second phonetization of the same entry
        :returns: (str) The selected pronunciations, joined by
            ``separators.variants``, or an empty string if both inputs are
            empty or blank.

        """
        # TODO: MERGE DAGs instead of merging prons
        pron = dict()
        for entry in (pron1, pron2):
            # An empty or blank entry would produce a Start->End path, i.e.
            # an empty pronunciation of length 1 polluting the selection.
            if not entry or not entry.strip():
                continue
            graph, prongraph = self.phon2DAG(entry)
            pron.update(self.DAG2phon(graph, prongraph))

        if len(pron) == 0:
            return ""

        v = separators.variants

        # Return all variants
        if self.variants == 0:
            return v.join(pron.keys())

        # Choose the shortest variants. The sort is stable, so among
        # pronunciations of equal length the first inserted ones are kept.
        shortest = sorted(pron.items(), key=lambda x: x[1])[:self.variants]
        return v.join(p for p, _ in shortest)