Source code for annotations.TextNorm.orthotranscription

# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.annotations.TextNorm.orthotranscription.py
:author:   Brigitte Bigi
:contact:  develop@sppas.org
:summary:  Manage an enriched orthographic transcription.

.. _This file is part of SPPAS: http://www.sppas.org/
..
    -------------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    -------------------------------------------------------------------------

"""

import re
from unicodedata import category

from sppas.src.config import sppasUnicode
from sppas.src.config import u

# ---------------------------------------------------------------------------


class sppasOrthoTranscription(object):
    """Manager of an orthographic transcription.

    This is a totally language-independent class. It supports the Enriched
    Orthographic Transcription (EOT) convention defined into the SPPAS
    software tool.

    From the manual Enriched Orthographic Transcription, two derived
    orthographic transcriptions are generated automatically by the
    tokenizer:

        - the "standard" transcription (the list of orthographic tokens);
        - the "faked spelling", a specific transcription from which the
          obtained phonetic tokens are used by the phonetization system.

    The following illustrates an utterance text normalization in French:

        - Transcription:
          j'ai on a j'ai p- (en)fin j'ai trouvé l(e) meilleur moyen
          c'était d(e) [loger,locher] chez des amis
          (English translation is: I've we've I've - well I found the best
          way was to live in friends' apartment')

        - Result of the standard tokens:
          j' ai on a j' ai p- enfin j' ai trouvé le meilleur moyen
          c' était de loger chez des amis

        - Result of the faked tokens:
          j' ai on a j' ai p- fin j' ai trouvé l meilleur moyen
          c' était d loche chez des amis

    All the methods are static: the class holds no state.

    """

    def __init__(self):
        """Create a new instance. There is nothing to initialize."""
        pass

    # -----------------------------------------------------------------------

    @staticmethod
    def clean_toe(entry):
        """Clean Enriched Orthographic Transcription.

        The convention includes information that must be removed: the
        markers of proper names, the tags of the activity (gpd_N, gpf_N,
        ipu_N) and the isolated parenthesis contents. The "[left,right]"
        sequences are re-formatted by the private callback of this class.

        Every occurrence is substituted: re.UNICODE is given with the
        "flags" keyword. Given positionally, it would be interpreted as
        the "count" argument of re.sub, i.e. a maximum of 32 replacements.

        :param entry: (str) the EOT string
        :returns: (str) the cleaned string, with whitespace normalized

        """
        # Proper names: $ name ,P\$
        entry = re.sub(u(r',\s?[PTS]+\s?[\/\\]+\s?\$'), r'', entry,
                       flags=re.UNICODE)
        entry = re.sub(u(r'\$'), r'', entry, flags=re.UNICODE)

        # Tags of the activity
        entry = re.sub(u(r'(gpd_[0-9]+)'), r" ", entry, flags=re.UNICODE)
        entry = re.sub(u(r'(gpf_[0-9]+)'), r" ", entry, flags=re.UNICODE)
        entry = re.sub(u(r'(ipu_[0-9]+)'), r" ", entry, flags=re.UNICODE)

        # Remove invalid parenthesis content: a parenthesized sequence which
        # is not stuck to a token (in the middle, at the beginning, at the end)
        entry = re.sub(u(r'\s+\([\w\xaa-\xff]+\)\s+'), ' ', entry,
                       flags=re.UNICODE)
        entry = re.sub(u(r'^\([\w\xaa-\xff]+\)\s+'), ' ', entry,
                       flags=re.UNICODE)
        entry = re.sub(u(r'\s+\([\w\xaa-\xff]+\)$'), ' ', entry,
                       flags=re.UNICODE)

        # Re-format the "[left,right]" sequences
        entry = re.sub(u(r'\s*\[([^,]+),([^,]+)\]'),
                       sppasOrthoTranscription.__replace,
                       entry,
                       flags=re.UNICODE)

        # Collapse any series of whitespace into a single space
        return " ".join(entry.split())

    # -----------------------------------------------------------------------

    @staticmethod
    def toe_spelling(entry, std=False):
        r"""Create a specific spelling from an Enriched Orthographic Transcription.

        :param entry: (str) the EOT string
        :param std: (bool) Standard spelling expected instead of the Faked one.
        :returns: (str)

        DevNote: Python's regular expression engine supports Unicode. To
        create a regular expression pattern that uses Unicode character
        classes for \w (and \s, and \b), use the "(?u)" flag prefix, or the
        re.UNICODE flag. This flag must be given with the "flags" keyword of
        re.sub: its 4th positional argument is "count", the maximum number
        of replacements.

        """
        # Ensure all regexp will work!
        _fentry = " " + u(entry) + " "

        if std is False:
            # Stick un-regular liaisons to the previous token
            _fentry = re.sub(u(r' =([\w]+)='), r'-\1', _fentry,
                             flags=re.UNICODE)
        else:
            # Remove liaisons
            _fentry = re.sub(u(r' =([\w]+)='), u(' '), _fentry,
                             flags=re.UNICODE)

        # Laughing sequences
        _fentry = re.sub(u(r"\s?@\s?@\s?"), u(' '), _fentry,
                         flags=re.UNICODE)

        # Laughter: separate the symbol from the token it is stuck to
        _fentry = re.sub(u(r"([\w\xaa-\xff]+)@"), u(r"\1 @"), _fentry,
                         flags=re.UNICODE)
        _fentry = re.sub(u(r"@([\w\xaa-\xff]+)"), u(r"@ \1"), _fentry,
                         flags=re.UNICODE)

        # Noises: separate the symbol from the token it is stuck to
        _fentry = re.sub(u(r"([\w\xaa-\xff]+)\*"), u(r"\1 *"), _fentry,
                         flags=re.UNICODE)
        _fentry = re.sub(u(r"\*([\w\xaa-\xff]+)"), u(r"* \1"), _fentry,
                         flags=re.UNICODE)

        # Transcriptor comment's: {comment}
        _fentry = re.sub(u(r'\{[\s\w\xaa-\xff\-:]+\}'), '', _fentry,
                         flags=re.UNICODE)
        # Transcriptor comment's: [comment]
        _fentry = re.sub(u(r'\[[\s\w\xaa-\xff\-:]+\]'), '', _fentry,
                         flags=re.UNICODE)

        if std is False:
            # Special elisions (remove parenthesis content)
            _fentry = re.sub(u(r"\([\s\w\xaa-\xff\-']+\)"), '', _fentry,
                             flags=re.UNICODE)
        else:
            # Special elisions (keep parenthesis content)
            _fentry = re.sub(u(r'\(([\s\w\xaa-\xff\-]+)\)'), u(r'\1'),
                             _fentry, flags=re.UNICODE)

        # Morphological variants: <a,b> is turned into {a|b}
        _fentry = re.sub(
            u(r"\s+\<([\-'\s\w\xaa-\xff]+),([\-'\s\w\xaa-\xff]+)\>"),
            u(r' {\1|\2}'),
            _fentry,
            flags=re.UNICODE)

        # A substitution of "{a,b}" by "a" was removed from SPPAS 1.9.6.
        # It probably corresponded to a specific corpus.

        if std is False:
            # Special pronunciations (keep right part)
            _fentry = re.sub(
                u(r'\s+\[([\s\w\xaa-\xff/-]+),([\s\w\xaa-\xff/]+)\]'),
                u(r' \2'),
                _fentry,
                flags=re.UNICODE)
        else:
            # Special pronunciations (keep left part)
            _fentry = re.sub(
                u(r'\s+\[([\s\w\xaa-\xff\/-]+),[\s\w\xaa-\xff\/]+\]'),
                u(r' \1'),
                _fentry,
                flags=re.UNICODE)

        # Proper names: $ name ,P\$
        _fentry = re.sub(u(r',\s?[PTS]+\s?[\/\\]+\s?\$'), '', _fentry,
                         flags=re.UNICODE)
        _fentry = re.sub(u(r'\$'), '', _fentry, flags=re.UNICODE)

        # specific case with numbers: remove the whitespace before ",digits"
        _fentry = re.sub(u(r"\s(?=,[0-9]+)"), '', _fentry, flags=re.UNICODE)

        # ok, now stop regexp and work with unicode:
        _fentry = sppasUnicode(_fentry).to_strip()

        # Separate the punctuations (Unicode categories P* and S*) which are
        # stuck to the beginning or to the end of a token
        s = []
        for c in _fentry.split():
            # A truncated word ends with "-": its end is left unchanged
            is_trunc = c.endswith("-")
            # A SAMPA sequence is a token in-between slashes
            is_sampa = c.startswith("/") and c.endswith('/')

            if is_sampa is False:
                # if there is a serie of punctuations at the beginning,
                # each one is a token by itself
                while len(c) > 0 and category(c[0])[0] in ('P', 'S'):
                    s.append(c[0])
                    c = c[1:]

                # if there is a serie of punctuations at the end
                end_punct = []
                if is_trunc is False:
                    while len(c) > 0 and category(c[-1])[0] in ('P', 'S'):
                        end_punct.append(c[-1])
                        c = c[:-1]

                if len(end_punct) == 1 and end_punct[0] == u("."):
                    # a single final dot remains stuck to its token
                    s.append(c + u("."))
                else:
                    s.append(c)
                    if len(end_punct) > 0:
                        # collected from right to left: restore the order
                        s.extend(reversed(end_punct))
            else:
                # the SAMPA sequence is never split; it is concatenated to
                # the previous token, if any
                if len(s) == 0:
                    s.append(c)
                else:
                    s[-1] += c

        return " ".join(s)

    # -----------------------------------------------------------------------

    @staticmethod
    def __replace(obj):
        """Callback for clean_toe.

        Re-format a "[left,right]" sequence: the parentheses of the left
        part are removed and its whitespace is replaced by underscores; the
        whitespace of the right part is removed.

        :param obj: (MatchObject) match with the left part in group 1 and
        the right part in group 2
        :returns: (str) the string " [left,right]", with a leading space

        """
        # Left part
        # Remove parentheses
        left = obj.group(1).replace('(', '')
        left = left.replace(')', '')
        # Replace spaces with underscores
        left = "_".join(left.split())

        # Right part
        # Remove spaces
        right = obj.group(2)
        right = "".join(right.split())

        return " [{:s},{:s}]".format(left, right)