# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.annotations.TextNorm.tokenize.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: Tokenization module for the multilingual text norm system.
.. _This file is part of SPPAS: http://www.sppas.org/
..
-------------------------------------------------------------------------
 ___   __    __    __    ___
/     |  \  |  \  |  \  /              the automatic
\__   |__/  |__/  |___| \__             annotation and
   \  |     |     |   |    \             analysis
___/  |     |     |   | ___/              of speech
Copyright (C) 2011-2021 Brigitte Bigi
Laboratoire Parole et Langage, Aix-en-Provence, France
Use of this software is governed by the GNU Public License, version 3.
SPPAS is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SPPAS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SPPAS. If not, see <http://www.gnu.org/licenses/>.
This banner notice must not be removed.
-------------------------------------------------------------------------
"""
import re
from sppas.src.config import sppasUnicode
# ---------------------------------------------------------------------------
class sppasTokenSegmenter(object):
"""Create words from tokens on the basis of a lexicon.
This is a totally language independent method, based on a longest
matching algorithm to aggregate tokens into words. Words of a lexicon
are found and:
1/ unbind or not if they contain a separator character:
- rock'n'roll -> rock'n'roll
- I'm -> I 'm
- it's -> it 's
2/ bind using a character separator like for example, with '_':
- parce que -> parce_que
- rock'n roll -> rock'n_roll
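
    :Example: an illustrative sketch, not part of the original code; it
    assumes ``vocab`` is a loaded vocabulary whose ``is_unk()`` method
    knows the entries "parce_que" and "'m", and in which the other
    aggregations of these tokens are unknown:

        >>> seg = sppasTokenSegmenter(vocab)
        >>> seg.bind(["parce", "que", "demain"])
        ['parce_que', 'demain']
        >>> seg.unbind(["I'm"])
        ['I', "'m"]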
"""
SEPARATOR = "_"
STICK_MAX = 7
# -------------------------------------------------------------------------
[docs] def __init__(self, vocab=None):
"""Create a new sppasTokenSegmenter instance.
:param vocab: (Vocabulary)
"""
self.__vocab = vocab
self.__separator = sppasTokenSegmenter.SEPARATOR
self.__aggregate_max = sppasTokenSegmenter.STICK_MAX
# -------------------------------------------------------------------------
    def set_aggregate_max(self, value=STICK_MAX):
        """Fix the maximum number of tokens to stick into a word.

        This is a language-dependent value. For French, it is 5, because
        of the multi-word expression "au fur et à mesure". It can be set
        higher in order to stick phrases instead of words.

        :param value: (int) Maximum number of tokens to aggregate/stick.
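
        :Example: an illustrative sketch, not part of the original code:

            >>> seg = sppasTokenSegmenter()
            >>> seg.set_aggregate_max(5)
            >>> seg.set_aggregate_max(0)
            Traceback (most recent call last):
                ...
            ValueError: set_aggregate_max: value should be > 0.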
"""
value = int(value)
if value < 1:
raise ValueError('set_aggregate_max: value should be > 0.')
if value > 100:
raise ValueError('set_aggregate_max: value should be < 100.')
self.__aggregate_max = value
# -------------------------------------------------------------------------
    def set_separator(self, char=SEPARATOR):
        """Fix the character used to separate stuck tokens.

        Only the first character of the given string is kept.

        :param char: (char) Separator character. Can be an empty string.
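
        :Example: an illustrative sketch, not part of the original code:

            >>> seg = sppasTokenSegmenter()
            >>> seg.set_separator("")    # stick tokens with no separator at all
            >>> seg.set_separator("_-")  # only the first character '_' is kept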
"""
char = str(char)
if len(char) > 0:
char = char[0]
self.__separator = char
# -------------------------------------------------------------------------
    def __stick_longest_lr(self, phrase, separator):
        """Return the index and the longest word starting the phrase.

        A longest-matching algorithm is applied from left to right: the
        whole phrase is searched in the vocabulary first, then the phrase
        without its last token, and so on, until a match is found or a
        single token remains.

        :param phrase: (str) Space-separated tokens
        :param separator: (str) Character used to join the stuck tokens
        :returns: tuple of (index of the last token forming the longest
            word, the longest word)
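
        For example, with a vocabulary that knows "parce_que" but not
        "parce_que_demain", the phrase "parce que demain" and the
        separator '_' give the tuple ``(1, 'parce_que')``: two tokens
        were consumed, so the index of the last one is 1.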
"""
tab_toks = phrase.split(" ")
token = tab_toks[0]
i = len(tab_toks)
if self.__vocab is None:
return 1, token
while i > 0:
# try to aggregate all tokens
token = separator.join(tab_toks)
# next round will try without the last token
tab_toks.pop()
i -= 1
# find if this is a word in the vocabulary
if self.__vocab.is_unk(token) is False:
break
# the first real token is the first given token
return i, sppasUnicode(token).to_strip()
# -----------------------------------------------------------------------
    def bind(self, utt):
        """Bind tokens of an utterance using a specific character.

        :param utt: (list) List of tokens of an utterance
            (a transcription, a sentence, ...)
        :returns: (list) A list of strings
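
        :Example: an illustrative sketch, not part of the original code;
        it assumes a vocabulary in which "parce_que" is an entry and no
        other aggregation of these tokens is known:

            >>> seg.bind(["tu", "sais", "parce", "que"])
            ['tu', 'sais', 'parce_que']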
"""
new_utt = list()
idx_start = 0
while idx_start < len(utt):
# use a longest matching to aggregate the current token with the next ones
idx_end = min(len(utt), idx_start+self.__aggregate_max+1)
phrase = " ".join(utt[idx_start:idx_end])
idx_end, word = self.__stick_longest_lr(sppasUnicode(phrase).to_strip(), self.__separator)
new_utt.append(word)
idx_start += idx_end + 1
return new_utt
# -----------------------------------------------------------------------
    def unbind(self, utt):
        """Unbind tokens containing - or ' or . depending on rules.

        :param utt: (list) List of tokens of an utterance
            (a transcription, a sentence, ...)
        :returns: (list) A list of strings
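
        :Example: an illustrative sketch, not part of the original code;
        it assumes a vocabulary in which "rock'n'roll" and "'s" are
        entries but "it's" is not:

            >>> seg.unbind(["it's", "rock'n'roll"])
            ['it', "'s", "rock'n'roll"]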
"""
new_utt = list()
for tok in utt:
is_unknown = self.__vocab.is_unk(tok.lower().strip())
is_sampa = tok.startswith('/') and tok.endswith('/')
is_trunc = tok.endswith('-')
# a missing compound word?
# --> an unknown token
# --> containing a special character
# --> that is not a truncated word
# --> not in a sampa sequence!
if is_unknown is True \
and ("-" in tok or "'" in tok or "." in tok) \
and is_sampa is False\
and is_trunc is False:
# KEEP special chars in the array!
tab_split = re.split("([-'.])", tok)
tab_tok = list(entry for entry in tab_split if len(entry) > 0)
idx_start = 0
while idx_start < len(tab_tok):
# use a longest matching to aggregate the current token with the next ones
idx_end = min(len(tab_tok), idx_start + 5)
phrase = " ".join(tab_tok[idx_start:idx_end])
idx_end, word = self.__stick_longest_lr(sppasUnicode(phrase).to_strip(), "")
new_utt.append(word)
idx_start += idx_end + 1
else:
new_utt.append(sppasUnicode(tok).to_strip())
return new_utt