Source code for annotations.TextNorm.splitter

# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.annotations.TextNorm.splitter.py
:author:   Brigitte Bigi
:contact:  develop@sppas.org
:summary:  Split step of the normalization automatic annotation.

.. _This file is part of SPPAS: http://www.sppas.org/
..
    -------------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    -------------------------------------------------------------------------

"""

import re

from sppas.src.resources.dictrepl import sppasDictRepl
from sppas.src.config.makeunicode import u

from .language import sppasLangISO

# ---------------------------------------------------------------------------


[docs]class sppasSimpleSplitter(object): """Utterance splitter Module to split a string for the multilingual text normalization system. Split an utterance into tokens using whitespace or characters. Should be extended to properly split telephone numbers or dates, etc. (for written texts). """
[docs] def __init__(self, lang, dict_replace=None, speech=True): """Creates a sppasSimpleSplitter. :param lang: the language code in iso639-3. :param dict_replace: Replacement dictionary :param speech: (bool) split transcribed speech vs written text """ self.__lang = lang self.__speech = speech if dict_replace is not None: self.__repl = dict_replace else: self.__repl = sppasDictRepl(None)
# -----------------------------------------------------------------------
[docs] def split_characters(self, utt): """Split an utterance by characters. :param utt: (str) the utterance (a transcription, a sentence, ...) in utf-8 :returns: A string (split character by character, using whitespace) """ y = u(utt) tmp = " ".join(y) # split all characters except numbers and ascii characters sstr = re.sub(u("([0-90-9a-zA-ZA-T\s]+\.?[0-90-9a-zA-ZA-T\s]+)"), lambda o: u(" %s " % o.group(0).replace(" ", "")), tmp) # and dates... if self.__speech is False: sstr = re.sub(u("([0-90-9\s]+\.?[月年日\s]+)"), lambda o: u(" %s " % o.group(0).replace(" ", "")), sstr) # and ・ sstr = re.sub(u('[\s]*・[\s]*'), u("・"), sstr) return sstr
# -----------------------------------------------------------------------
[docs] def split(self, utt): """Split an utterance using whitespace. If the language is character-based, split each character. :param utt: (str) an utterance of a transcription, a sentence, ... :param std: (bool) :returns: A list (array of string) """ s = utt if sppasLangISO.without_whitespace(self.__lang) is True: s = self.split_characters(s) toks = list() for t in s.split(): # if not a phonetized entry if t.startswith("/") is False and t.endswith("/") is False: if sppasLangISO.without_whitespace(self.__lang) is False: # Split numbers if stick to characters # attention: do not replace [a-zA-Z] by [\w] (because \w includes numbers) # and not on Asian languages: it can be a tone! t = re.sub(u('([0-9])([a-zA-Z])'), u(r'\1 \2'), t) t = re.sub(u('([a-zA-Z])([0-9])'), u(r'\1 \2'), t) # Split some punctuation t = re.sub(u('\\[\\]'), u(r'\\] \\['), t) # Split dots if stick to the beginning of a word # info: a dot at the end of a word is analyzed by the tokenizer t = re.sub(u(' \.([\w-])'), u(r' . \1'), t) t = re.sub(u('^\.([\w-])'), u(r' . \1'), t) # Split replacement characters for r in self.__repl: if t.endswith(r): t = t[:-len(r)] t = t + ' ' + r toks.append(t.strip()) # s = " ".join(toks) # Then split each time there is a space and return result # s = sppasUnicode(s).to_strip() return s.split()