SPPAS 4.22

https://sppas.org/

Module sppas.src.annotations

Class sppasTextNorm

Description

Text normalization automatic annotation.

Constructor

Create a sppasTextNorm instance without any linguistic resources.

Parameters
  • log: (sppasLog) Human-readable logs.
View Source
def __init__(self, log=None):
    """Create a sppasTextNorm instance without any linguistic resources.

    :param log: (sppasLog) Human-readable logs.

    """
    super(sppasTextNorm, self).__init__('textnorm.json', log)
    self.__normalizer = TierNormalizer(logfile=log)
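
A minimal instantiation sketch; the import path follows the module indicated at the top of this page:

from sppas.src.annotations import sppasTextNorm

# No linguistic resources are loaded at this point:
# load_resources() must be called before running the annotation.
normalizer = sppasTextNorm()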

Public functions

load_resources

Fix the list of words of a given language.

It allows better tokenization and enables language-dependent modules such as num2letters.

Parameters
  • vocab_filename: (str) File with the list of words of the language
  • lang: (str) the language code
View Source
def load_resources(self, vocab_filename, lang='und', **kwargs):
    """Fix the list of words of a given language.

        It allows a better tokenization, and enables the language-dependent
        modules like num2letters.

        :param vocab_filename: (str) File with the orthographic transcription
        :param lang: (str) the language code

        """
    if os.path.isfile(vocab_filename) is True:
        voc = sppasVocabulary(vocab_filename)
    else:
        voc = sppasVocabulary()
        logging.warning('Vocabulary file {:s} for language {:s} not found.'.format(vocab_filename, lang))
    self.__normalizer = TierNormalizer(voc, lang, logfile=self.logfile)
    self.logfile.print_message(info(1164, 'annotations').format(len(voc)), indent=0)
    replace_filename = os.path.join(paths.resources, 'repl', lang + '.repl')
    if os.path.isfile(replace_filename) is True:
        dict_replace = sppasDictRepl(replace_filename, nodump=True)
    else:
        dict_replace = sppasDictRepl()
        logging.warning('Replacement vocabulary not found.')
    self.__normalizer.set_repl(dict_replace)
    self.logfile.print_message(info(1166, 'annotations').format(len(dict_replace)), indent=0)
    punct_filename = os.path.join(paths.resources, 'vocab', 'Punctuations.txt')
    if os.path.isfile(punct_filename) is True:
        vocab_punct = sppasVocabulary(punct_filename, nodump=True)
    else:
        vocab_punct = sppasVocabulary()
    self.__normalizer.set_punct(vocab_punct)
    number_filename = os.path.join(paths.resources, 'num', lang.lower() + '_num.repl')
    if os.path.exists(number_filename) is True:
        numbers = sppasDictRepl(number_filename, nodump=True)
    else:
        numbers = sppasDictRepl()
        logging.warning('Dictionary of numbers not found.')
    self.__normalizer.set_num(numbers)
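
For example, with a French word list (a sketch; the resource path is hypothetical and depends on the installation):

normalizer.load_resources('/path/to/resources/vocab/fra.vocab', lang='fra')
# As shown in the source above, this also loads 'repl/fra.repl'
# (replacements), 'vocab/Punctuations.txt' and 'num/fra_num.repl'
# from the SPPAS resources directory when they exist.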

fix_options

Fix all options. Available options are:

  • faked
  • std
  • custom
  • occ_dur
  • tok_speech
  • any option whose key contains "pattern"
Parameters
  • options: (sppasOption)
View Source
def fix_options(self, options):
    """Fix all options. Available options are:

            - faked
            - std
            - custom

        :param options: (sppasOption)

        """
    for opt in options:
        key = opt.get_key()
        if key == 'faked':
            self.set_faked(opt.get_value())
        elif key == 'std':
            self.set_std(opt.get_value())
        elif key == 'custom':
            self.set_custom(opt.get_value())
        elif key == 'occ_dur':
            self.set_occ_dur(opt.get_value())
        elif key == 'tok_speech':
            self.set_tok_not_sil(opt.get_value())
        elif 'pattern' in key:
            self._options[key] = opt.get_value()
        else:
            raise AnnotationOptionError(key)
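
An equivalent way to configure the annotation, without constructing sppasOption objects, is to call the setters this method dispatches to (documented below); the values shown are arbitrary:

normalizer.set_faked(True)        # create the faked tokenization tier
normalizer.set_std(True)          # create the standard tokenization tier
normalizer.set_custom(False)      # no customized tokenization
normalizer.set_occ_dur(True)      # add the occurrences and durations tiers
normalizer.set_tok_not_sil(True)  # count speech tokens only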

set_faked

Fix the faked option.

Parameters
  • value: (bool) Create a faked tokenization
View Source
def set_faked(self, value):
    """Fix the faked option.

        :param value: (bool) Create a faked tokenization

        """
    self._options['faked'] = value

set_std

Fix the std option.

Parameters
  • value: (bool) Create a standard tokenization
View Source
def set_std(self, value):
    """Fix the std option.

        :param value: (bool) Create a standard tokenization

        """
    self._options['std'] = value

set_custom

Fix the custom option.

Parameters
  • value: (bool) Create a customized tokenization
View Source
def set_custom(self, value):
    """Fix the custom option.

        :param value: (bool) Create a customized tokenization

        """
    self._options['custom'] = value

set_occ_dur

Fix the option to generate the occurrences and durations tiers.

Parameters
  • value: (bool) Create tiers with the number of tokens and the duration of each annotation
View Source
def set_occ_dur(self, value):
    """Fix the occurrences and duration tiers generation option.

        :param value: (bool) Create a tier with nb of tokens and duration

        """
    self._options['occ_dur'] = value

set_tok_not_sil

Fix the option to count speech tokens only.

Parameters
  • value: (bool) Estimate the number of tokens excluding non-speech events
View Source
def set_tok_not_sil(self, value):
    """Fix the token is speech tiers generation option.

        :param value: (bool) nb of tokens estimated without not-events ones

        """
    self._options['tok_speech'] = value

convert

Text normalization of all labels of a tier.

Parameters
  • tier: (sppasTier) the orthographic transcription (standard or EOT)
Returns
  • A tuple with 3 tiers named "Tokens" (the faked tokens), "Tokens-Std" and "Tokens-Custom"; a tier is None when its option is disabled
View Source
def convert(self, tier):
    """Text normalization of all labels of a tier.

        :param tier: (sppasTier) the orthographic transcription (standard or EOT)
        :returns: A tuple with 3 tiers named:
            - "Tokens-Faked",
            - "Tokens-Std",
            - "Tokens-Custom"

        """
    if tier is None:
        raise IOError('No tier found.')
    if tier.is_empty() is True:
        raise EmptyInputError(name=tier.get_name())
    tokens_faked = None
    if self._options['faked'] is True:
        actions = ['replace', 'tokenize', 'numbers', 'lower', 'punct']
        tokens_faked = self.__normalizer.normalize_tier(tier, actions)
        tokens_faked.set_name('Tokens')
    tokens_std = None
    if self._options['std'] is True:
        actions = ['std', 'replace', 'tokenize', 'numbers', 'lower', 'punct']
        tokens_std = self.__normalizer.normalize_tier(tier, actions)
        tokens_std.set_name('Tokens-Std')
    tokens_custom = None
    if self._options['custom'] is True:
        actions = ['std', 'tokenize']
        tokens_custom = self.__normalizer.normalize_tier(tier, actions)
        tokens_custom.set_name('Tokens-Custom')
    if tokens_faked is not None and tokens_std is not None:
        self.__force_align_tiers(tokens_std, tokens_faked)
    return (tokens_faked, tokens_std, tokens_custom)
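
A sketch of a typical call, where the input tier is obtained with get_inputs(), documented below (the filename is hypothetical):

tier = normalizer.get_inputs(['file-with-transcription.xra'])
faked, std, custom = normalizer.convert(tier)
# Each element of the tuple is None when its option is disabled.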

occ_dur

Create tiers with the number of tokens and the duration of each annotation.

Parameters
  • tier: (sppasTier)
View Source
def occ_dur(self, tier):
    """Create a tier with labels and duration of each annotation.

        :param tier:

        """
    occ = sppasTier('Occ%s' % tier.get_name())
    dur = sppasTier('Dur%s' % tier.get_name())
    for ann in tier:
        labels = ann.get_labels()
        nb_occ = len(labels)
        if self._options['tok_speech'] is True:
            nb_occ = 0
            for label in labels:
                if label.get_best().is_speech() is True:
                    nb_occ += 1
        location = ann.get_location()
        duration = location.get_best().duration().get_value()
        occ.create_annotation(location.copy(), sppasLabel(sppasTag(nb_occ, tag_type='int')))
        dur.create_annotation(ann.get_location().copy(), sppasLabel(sppasTag(round(duration, 4), tag_type='float')))
    return (occ, dur)
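
For example, applied to the faked tokens tier returned by convert() (a sketch):

occ_tier, dur_tier = normalizer.occ_dur(faked)
# For an input tier named 'Tokens', 'OccTokens' holds one integer label
# per annotation (its number of tokens) and 'DurTokens' one float label
# (its duration, rounded to 4 decimal places).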

get_inputs

Return the tier with the orthographic transcription.

Parameters
  • input_files: (list)
Raises
  • NoTierInputError

Returns
  • (sppasTier)
View Source
def get_inputs(self, input_files):
    """Return the the tier with aligned tokens.

        :param input_files: (list)
        :raise: NoTierInputError
        :return: (sppasTier)

        """
    tier = None
    annot_ext = self.get_input_extensions()
    for filename in input_files:
        if filename is None:
            continue
        fn, fe = os.path.splitext(filename)
        if tier is None and fe in annot_ext[0]:
            parser = sppasTrsRW(filename)
            trs_input = parser.read()
            tier = sppasFindTier.transcription(trs_input)
            if tier is not None:
                if self.logfile:
                    self.logfile.print_message('Input tier to be normalized: {}'.format(tier.get_name()), indent=1)
                return tier
    logging.error('No tier with an orthographic transcription was found.')
    raise NoTierInputError
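
In practice, None entries and files without an annotated-file extension are simply skipped, and the first transcription tier found is returned (a sketch with hypothetical filenames):

tier = normalizer.get_inputs([None, 'recording.wav', 'recording.xra'])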

run

Run the automatic annotation process on an input.

Parameters
  • input_files: (list of str) File(s) with the orthographic transcription
  • output: (str) the output file name
Returns
  • (sppasTranscription)
View Source
def run(self, input_files, output=None):
    """Run the automatic annotation process on an input.

        :param input_files: (list of str) orthographic transcription
        :param output: (str) the output file name
        :returns: (sppasTranscription)

        """
    tier_input = self.get_inputs(input_files)
    tier_faked_tokens, tier_std_tokens, tier_custom = self.convert(tier_input)
    trs_output = sppasTranscription(self.name)
    if tier_faked_tokens is not None:
        trs_output.append(tier_faked_tokens)
    if tier_std_tokens is not None:
        trs_output.append(tier_std_tokens)
    if tier_custom is not None:
        trs_output.append(tier_custom)
    if len(trs_output) > 0:
        if self._options['occ_dur'] is True:
            tier_occ, tier_dur = self.occ_dur(trs_output[0])
            trs_output.append(tier_occ)
            trs_output.append(tier_dur)
    trs_output.set_meta('text_normalization_result_of', input_files[0])
    trs_output.set_meta('language_iso', 'iso639-3')
    trs_output.set_meta('language_name_0', 'Undetermined')
    if len(self.__normalizer.lang) == 3:
        trs_output.set_meta('language_code_0', self.__normalizer.lang)
        trs_output.set_meta('language_url_0', 'https://iso639-3.sil.org/code/' + self.__normalizer.lang)
    else:
        trs_output.set_meta('language_code_0', 'und')
        trs_output.set_meta('language_url_0', 'https://iso639-3.sil.org/code/und')
    if output is not None:
        if len(trs_output) > 0:
            output_file = self.fix_out_file_ext(output)
            parser = sppasTrsRW(output_file)
            parser.write(trs_output)
            return [output_file]
        else:
            raise EmptyOutputError
    return trs_output
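
Putting the pieces together, a complete normalization of one file might look like the following sketch (filenames and the resource path are hypothetical):

from sppas.src.annotations import sppasTextNorm

normalizer = sppasTextNorm()
normalizer.load_resources('/path/to/resources/vocab/eng.vocab', lang='eng')
normalizer.set_std(True)
created = normalizer.run(['file-with-transcription.xra'], output='normalized')
# With `output` given, the result is written to disk and the list of
# created files is returned; otherwise the sppasTranscription is returned.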

get_output_pattern

Pattern this annotation uses in an output filename.

View Source
def get_output_pattern(self):
    """Pattern this annotation uses in an output filename."""
    return self._options.get('outputpattern', '-token')
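
So, unless the 'outputpattern' option overrides it, the files this annotation creates are marked with '-token':

normalizer.get_output_pattern()
# -> '-token' by default; an input like 'recording.xra' is then expected
# to produce 'recording-token.xra' under the usual SPPAS naming scheme.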

Protected functions

__force_align_tiers

Force standard spelling and faked spelling to share the same number of tokens.

Parameters
  • std_tier: (sppasTier)
  • faked_tier: (sppasTier)
View Source
def __force_align_tiers(self, std_tier, faked_tier):
    """Force standard spelling and faked spelling to share the same
        number of tokens.

        :param std_tier: (sppasTier)
        :param faked_tier: (sppasTier)

        """
    if self._options['std'] is False:
        return
    i = 0
    for ann_std, ann_faked in zip(std_tier, faked_tier):
        i += 1
        for label_std, label_faked in zip(ann_std.get_labels(), ann_faked.get_labels()):
            for (text_std, s1), (text_faked, s2) in zip(label_std, label_faked):
                try:
                    texts, textf = self.__align_tiers(text_std.get_content(), text_faked.get_content())
                    text_std.set_content(texts)
                    text_faked.set_content(textf)
                except:
                    self.logfile.print_message('Standard/Faked tokens matching error, at interval {:d}\n'.format(i), indent=2, status=1)
                    self.logfile.print_message(text_std.get_content(), indent=3)
                    self.logfile.print_message(text_faked.get_content(), indent=3)
                    self.logfile.print_message('Fall back on faked.', indent=3, status=3)
                    text_std.set_content(text_faked.get_content())

__align_tiers

Align standard spelling tokens with faked spelling tokens.

Parameters
  • std: (str)
  • faked: (str)
Returns
  • a tuple (std, faked) where the faked string has been re-aligned
View Source
def __align_tiers(self, std, faked):
    """Align standard spelling tokens with faked spelling tokens.

        :param std: (str)
        :param faked: (str)
        :returns: a tuple of std and faked

        """
    stds = std.split()
    fakeds = faked.split()
    if len(stds) == len(fakeds):
        return (std, faked)
    tmp = []
    for f in fakeds:
        toks = f.split('_')
        for t in toks:
            tmp.append(t)
    fakeds = tmp[:]
    num_tokens = len(stds)
    i = 0
    while i < num_tokens:
        if "'" in stds[i]:
            if not stds[i].endswith("'") and fakeds[i].endswith("'"):
                fakeds[i] = fakeds[i] + fakeds[i + 1]
                del fakeds[i + 1]
        if '-' in stds[i]:
            if not stds[i].endswith('-') and '-' not in fakeds[i]:
                fakeds[i] = fakeds[i] + fakeds[i + 1]
                del fakeds[i + 1]
        num_underscores = stds[i].count('_')
        if num_underscores > 0:
            if not self.__normalizer.vocab.is_unk(stds[i]):
                n = num_underscores + 1
                fakeds[i] = '_'.join(fakeds[i:i + n])
                del fakeds[i + 1:i + n]
        i += 1
    if len(stds) != len(fakeds):
        raise ValueError
    return (std, ' '.join(fakeds))
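
To illustrate the underscore-compound step of this alignment: when a standard token is a known compound written with '_', the corresponding faked tokens are re-joined so both sides keep the same count. A self-contained sketch of that single step (not the SPPAS code itself, and without the vocabulary lookup):

def join_compounds(stds, fakeds):
    """Re-join faked tokens so compound words cover one token on each side."""
    i = 0
    while i < len(stds):
        n = stds[i].count('_')
        if n > 0:
            # A compound with n underscores spans n + 1 faked tokens.
            fakeds[i] = '_'.join(fakeds[i:i + n + 1])
            del fakeds[i + 1:i + n + 1]
        i += 1
    return fakeds

print(join_compounds(['of_course', 'yes'], ['of', 'course', 'yes']))
# -> ['of_course', 'yes']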