Source code for annotations.TextNorm.normalize

# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.annotations.TextNorm.normalize.py
:author:   Brigitte Bigi
:contact:  develop@sppas.org
:summary:  Multilingual Text Normalization of an utterance.

.. _This file is part of SPPAS: http://www.sppas.org/
..
    -------------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    -------------------------------------------------------------------------

"""

import re
import logging

from sppas.src.config import sppasUnicode, u
from sppas.src.resources.vocab import sppasVocabulary
from sppas.src.resources.dictrepl import sppasDictRepl

from .orthotranscription import sppasOrthoTranscription
from .tokenize import sppasTokenSegmenter
from .language import sppasLangISO
from .splitter import sppasSimpleSplitter
from .num2text import sppasNumConstructor

# ---------------------------------------------------------------------------


class DictReplUTF8(sppasDictRepl):
    """Replacement dictionary of UTF-8 characters that caused problems.

    This is a hack to match with our dictionaries...
    TODO: This class should read an external replacement file...

    """
    def __init__(self):
        super(DictReplUTF8, self).__init__(None, nodump=True)
        self.add(u("æ"), u("ae"))
        self.add(u("œ"), u("oe"))
        self.add(u(","), u(", "))
        self.add(u("”"), u('"'))
        self.add(u("“"), u('"'))
        self.add(u("。"), u(". "))
        self.add(u("》"), u('"'))
        self.add(u("《"), u('"'))
        self.add(u("«"), u('"'))
        self.add(u("»"), u('"'))
        self.add(u("’"), u("'"))
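
    # Illustrative usage (a minimal sketch; sppasDictRepl.replace() is
    # assumed to return the replacement string of a known key, as it is
    # used below in TextNormalizer.normalize()):
    #
    #     >>> DictReplUTF8().replace(u("œ"))
    #     'oe'
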
# ---------------------------------------------------------------------------
class TextNormalizer(object):
    """Multilingual text normalization."""
    def __init__(self, vocab=None, lang="und"):
        """Create a TextNormalizer instance.

        :param vocab: (sppasVocabulary)
        :param lang: (str) the language code in iso639-3.

        """
        # resources
        self.dicoutf = DictReplUTF8()
        self.repl = sppasDictRepl(None)
        self.punct = sppasVocabulary()
        self.vocab = vocab
        if vocab is None:
            self.vocab = sppasVocabulary()
        self.num_dict = sppasDictRepl(None)

        # members
        self.lang = lang
        self.delimiter = ' '
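
    # Illustrative usage (a minimal sketch; with no vocabulary given, an
    # empty sppasVocabulary is created and tokens are separated by spaces):
    #
    #     >>> tn = TextNormalizer(lang="eng")
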
# -----------------------------------------------------------------------
    def get_vocab_filename(self):
        """Return the filename of the vocabulary."""
        return self.vocab.get_filename()
    # -----------------------------------------------------------------------
    # Options
    # -----------------------------------------------------------------------
    def set_delim(self, delim):
        """Set the delimiter, used to separate tokens.

        :param delim: (str) a unicode character.

        """
        self.delimiter = delim
# -----------------------------------------------------------------------
    def set_vocab(self, vocab):
        """Set the lexicon.

        :param vocab: (sppasVocabulary)

        """
        # TODO: test instance
        self.vocab = vocab
# -----------------------------------------------------------------------
    def set_repl(self, repl):
        """Set the dictionary of replacements.

        :param repl: (sppasDictRepl)

        """
        # TODO: test instance
        self.repl = repl
# -----------------------------------------------------------------------
    def set_punct(self, punct):
        """Set the list of punctuation.

        :param punct: (sppasVocabulary)

        """
        # TODO: test instance
        self.punct = punct
# -----------------------------------------------------------------------
    def set_lang(self, lang):
        """Set the language.

        :param lang: (str) the language code in iso639-3 (fra, eng, vie...).

        """
        self.lang = lang
# -----------------------------------------------------------------------
    def set_num(self, num_dict):
        """Set the dictionary of numbers.

        :param num_dict: (sppasDictRepl)

        """
        self.num_dict = num_dict
        try:
            sppasNumConstructor.construct(self.lang, self.num_dict)
            logging.info('Conversion of numbers enabled for language {:s}'
                         ''.format(self.lang))
        except Exception as e:
            logging.error('Conversion of numbers will be disabled due to the '
                          'following error: {:s}'.format(str(e)))
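
    # Illustrative usage (a minimal sketch; "num_fra.repl" is a hypothetical
    # file name, not a resource name confirmed by this module, and the
    # language must be set before the number dictionary):
    #
    #     >>> tn.set_lang("fra")
    #     >>> tn.set_num(sppasDictRepl("num_fra.repl"))
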
    # -----------------------------------------------------------------------
    # Language independent modules (or not!)
    # -----------------------------------------------------------------------
    def replace(self, utt):
        """Examine tokens and perform some replacements.

        A dictionary with symbols contains the replacements to operate.

        :param utt: (list) the utterance
        :returns: A list of strings

        """
        # Specific case of float numbers
        sent = ' '.join(utt)
        sent = re.sub(u(r'([0-9])\.([0-9])'),
                      u(r'\1 NUMBER_SEP_POINT \2'), sent)
        sent = re.sub(u(r'([0-9]),([0-9])'),
                      u(r'\1 NUMBER_SEP \2'), sent)
        sent = sppasUnicode(sent).to_strip()
        _utt = sent.split()

        # Other generic replacements
        _result = list()
        for s in _utt:
            if self.repl.is_key(s):
                s = self.repl.replace(s)
            _result.append(sppasUnicode(s).to_strip())

        return _result
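
    # Illustrative usage (a minimal sketch, traceable from the code above;
    # with an empty replacement dictionary, only the separators of float
    # numbers are rewritten):
    #
    #     >>> tn = TextNormalizer()
    #     >>> tn.replace([u("3.14")])
    #     ['3', 'NUMBER_SEP_POINT', '14']
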
# -----------------------------------------------------------------------
    def tokenize(self, utt):
        """Tokenize, i.e. segment the text into tokens.

        :param utt: (list)
        :returns: (list)

        """
        tok = sppasTokenSegmenter(self.vocab)

        # rules for - ' .
        unbind_result = tok.unbind(utt)

        # longest matching for languages written without whitespace
        if sppasLangISO.without_whitespace(self.lang):
            tok.set_separator("")
            tok.set_aggregate_max(15)
        bind_result = tok.bind(unbind_result)

        # restore default values of the tokenizer
        tok.set_aggregate_max()
        tok.set_separator()

        return bind_result
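
    # Illustrative usage (a minimal sketch; the exact segmentation depends
    # on the loaded vocabulary, and the longest-matching aggregation is
    # applied only to languages written without whitespace, presumably
    # e.g. "cmn"):
    #
    #     >>> tn.set_lang("cmn")
    #     >>> tn.tokenize(utt)  # characters are bound by longest matching
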
# -----------------------------------------------------------------------
    def numbers(self, utt):
        """Convert numbers to their written form.

        :param utt: (list)
        :returns: (list)

        """
        try:
            num2letter = sppasNumConstructor.construct(
                self.lang, self.num_dict)
        except Exception:
            return utt

        try:
            _result = list()
            for token in utt:
                if token.isdigit():
                    _result.append(num2letter.convert(token))
                else:
                    _result.append(token)
            return _result

        except Exception as e:
            logging.error('Conversion of numbers disabled due to the '
                          'following error: {:s}'.format(str(e)))
            return utt
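
    # Illustrative usage (a minimal sketch; requires a number dictionary set
    # with set_num() and the exact written form depends on the language
    # resources, e.g. "3" would typically give "trois" with French ones):
    #
    #     >>> tn.numbers([u("3"), u("chats")])
    #     ['trois', 'chats']
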
# -----------------------------------------------------------------------
    def lower(self, utt):
        """Lower the case of a list of strings.

        :param utt: (list)

        """
        _utt = list()
        for tok in utt:
            # do not change an already phonetized string
            if "/" not in tok:
                _utt.append(sppasUnicode(tok).to_lower())
            else:
                _utt.append(tok)

        return _utt
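
    # Illustrative usage (a minimal sketch, traceable from the code above;
    # "/iz/" is a hypothetical already-phonetized token, left unchanged):
    #
    #     >>> tn.lower([u("Hello"), u("/iz/")])
    #     ['hello', '/iz/']
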
# -----------------------------------------------------------------------
    def remove(self, utt, wlist):
        """Remove data of an utterance if included in a word list.

        Only used to remove punctuation.

        :param utt: (list)
        :param wlist: (sppasVocabulary)

        """
        _utt = []
        for tok in utt:
            tok = sppasUnicode(tok).to_strip()
            if wlist.is_unk(tok) is True \
                    and "gpd_" not in tok \
                    and "ipu_" not in tok:
                _utt.append(tok)

        return _utt
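
    # Illustrative usage (a minimal sketch; assumes the comma is an entry of
    # the punctuation vocabulary, so it is dropped as a known token):
    #
    #     >>> tn.remove([u("hello"), u(","), u("world")], tn.punct)
    #     ['hello', 'world']
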
    # -----------------------------------------------------------------------
    # The main normalizer is HERE!
    # -----------------------------------------------------------------------
    def normalize(self, entry, actions=None):
        """Normalize an utterance.

        :param entry: (str) the string to normalize
        :param actions: (list) the modules/options to enable.

            - "std": generate the standard orthography instead of the faked one
            - "replace": use a replacement dictionary
            - "tokenize": tokenize the entry
            - "numbers": convert numbers to their written form
            - "lower": change case of characters to lower
            - "punct": remove punctuation

        :returns: (list) the list of normalized tokens

        Important: An empty actions list or a list containing only "std"
        means to enable all actions.

        """
        # Work on a local copy: a mutable default argument is shared between
        # calls, and this method appends to the given list.
        if actions is None:
            actions = list()
        else:
            actions = list(actions)

        _str = sppasUnicode(entry).to_strip()

        # Remove UTF-8 specific characters that are not in our dictionaries!
        for key in self.dicoutf:
            _str = _str.replace(key, self.dicoutf.replace(key))

        # Clean the Enriched Orthographic Transcription
        ortho = sppasOrthoTranscription()
        _str = ortho.clean_toe(_str)
        if "std" in actions:
            _str = ortho.toe_spelling(_str, True)
        else:
            _str = ortho.toe_spelling(_str, False)

        # Split using whitespace or characters.
        splitter = sppasSimpleSplitter(self.lang, self.repl)
        utt = splitter.split(_str)

        # The entry is now a list of strings on which we'll perform actions
        # -----------------------------------------------------------------
        if len(actions) == 0 or (len(actions) == 1 and "std" in actions):
            actions.append("replace")
            actions.append("tokenize")
            actions.append("numbers")
            actions.append("lower")
            actions.append("punct")

        if "replace" in actions:
            utt = self.replace(utt)

        if "tokenize" in actions:
            utt = self.tokenize(utt)

        if "numbers" in actions:
            utt = self.numbers(utt)

        if "lower" in actions:
            utt = self.lower(utt)
            utt = TextNormalizer.variants(utt)

        if "punct" in actions:
            utt = self.remove(utt, self.punct)

        return [sppasUnicode(s).to_strip() for s in utt]
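
    # Illustrative usage (a minimal sketch; "vocab" stands for a previously
    # loaded sppasVocabulary, and the exact result depends on the vocabulary,
    # replacement and punctuation resources that were set):
    #
    #     >>> tn = TextNormalizer(vocab, lang="fra")
    #     >>> tn.normalize(u("Le  petit chat."))
    #     ['le', 'petit', 'chat']
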
    # Until SPPAS 1.9.5, the result was a string:
    #
    #     result = ""
    #     for s in utt:
    #         s = sppasUnicode(s).to_strip()
    #         result = result + " " + s.replace(" ", "_")
    #     result = sppasUnicode(result).to_strip()
    #     if len(result) == 0:
    #         return ""  # Nothing valid!
    #     return result.replace(" ", self.delimiter)

    # -----------------------------------------------------------------------
    @staticmethod
    def variants(utt):
        """Convert strings that are variants in the utterance.

        :param utt: (list)

        """
        c = " ".join(utt)
        c = c.replace('{ ', '{')
        c = c.replace(' }', '}')
        c = c.replace(' | ', '|')

        inside = False
        cc = u("")
        for character in c:
            if character == "{":
                inside = True
            elif character == "}":
                inside = False

            if inside is True:
                if character == " ":
                    cc += u("_")
                else:
                    cc += character
            else:
                cc += character

        return cc.split()
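
    # Illustrative usage (a minimal sketch, traceable from the code above:
    # spaces inside a {variant|variant} group become underscores, so the
    # whole group is kept as a single token):
    #
    #     >>> utt = [u("a"), u("{"), u("big"), u("|"), u("large"), u("}")]
    #     >>> TextNormalizer.variants(utt)
    #     ['a', '{big|large}']
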