# Source code for annotations.Align.aligners.juliusalign

"""
:filename: sppas.src.annotations.Align.aligners.juliusalign.py
:author:   Brigitte Bigi
:contact:  develop@sppas.org
:summary:  Wrapper for Julius aligner.

.. _This file is part of SPPAS: <http://www.sppas.org/>
..
    ---------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    ---------------------------------------------------------------------

http://julius.sourceforge.jp/en_index.php

`Julius` is a high-performance, two-pass large vocabulary continuous
speech recognition (LVCSR) decoder software for speech-related researchers
and developers. Based on word N-gram and context-dependent HMM, it can
perform almost real-time decoding on most current PCs in 60k word dictation
task. Major search techniques are fully incorporated such as tree lexicon,
N-gram factoring, cross-word context dependency handling, enveloped beam
search, Gaussian pruning, Gaussian selection, etc.
Besides search efficiency, it is also modularized carefully to be independent
from model structures, and various HMM types are supported such as
shared-state triphones and tied-mixture models, with any number of mixtures,
states, or phones. Standard formats are adopted to cope with other free
modeling toolkits such as HTK, the CMU-Cam SLM toolkit, etc.

The main platform is Linux and other Unix workstations, and also works on
Windows. Most recent version is developed on Linux and Windows (cygwin /
mingw), and also has a Microsoft SAPI version. Julius is distributed under an
open license together with its source code.

Julius has been developed as a research software for Japanese LVCSR since
1997, and the work was continued under IPA Japanese dictation toolkit
project (1997-2000), Continuous Speech Recognition Consortium, Japan (CSRC)
(2000-2003) and currently Interactive Speech Technology Consortium (ISTC).

"""

import os
import codecs
from subprocess import Popen, PIPE, STDOUT
import logging

from sppas.src.config import sg
from sppas.src.config import symbols
from sppas.src.config import sppasUnicode, u
from sppas.src.resources.dictpron import sppasDictPron
from sppas.src.annotations.Align.models.slm.ngramsmodel import sppasNgramsModel
from sppas.src.annotations.Align.models.slm.arpaio import sppasArpaIO
from sppas.src.annotations.Align.models.slm.ngramsmodel import START_SENT_SYMBOL, END_SENT_SYMBOL

from .basealigner import BaseAligner
from .alignerio import BaseAlignersReader

# ----------------------------------------------------------------------------

# The entry of the `symbols.phone` table whose value is "silence":
# look up the position of "silence" among the values, and take the key
# at the same position (dicts preserve insertion order in Python 3.7+).
# NOTE(review): presumably resolves to something like "#" or "sil" —
# depends on the SPPAS symbols configuration; confirm against sppas.src.config.
SIL_PHON = list(symbols.phone)[list(symbols.phone.values()).index("silence")]

# ----------------------------------------------------------------------------


class JuliusAligner(BaseAligner):
    """Julius automatic alignment system.

    JuliusAligner is able to align one audio segment that can be:

        - an inter-pausal unit,
        - an utterance,
        - a sentence,

    ... no longer than a few seconds.

    Things needed to run JuliusAligner:

    To perform speech segmentation with Julius, three "models" have to be
    prepared. The models should define the linguistic properties of the
    language: recognition unit, audio properties of the unit and the
    linguistic constraint for the connection between the units.
    Typically the unit should be a word, and you should give Julius these
    models below:

    1. "Acoustic model", which is a stochastic model of input waveform
       patterns, typically per phoneme. Format is HTK-ASCII model.

    2. "Word dictionary", which defines the vocabulary.

    3. "Language model", which defines syntax-level rules that define the
       connection constraint between words. It should give the constraint
       for the acceptable or preferable sentence patterns. It can be:

       * either a rule-based grammar,
       * or a probabilistic N-gram model.

    This class automatically constructs the word dictionary and the
    language model from both:

        - the tokenization of speech,
        - the phonetization of speech.

    If outext is set to "palign", JuliusAligner will use a grammar and
    it will produce both phone and word alignments.

    If outext is set to "walign", JuliusAligner will use a statistical
    language model and will produce word alignments only.

    """
[docs] def __init__(self, model_dir=None): """Create a JuliusAligner instance. :param model_dir: (str) Name of the directory of the acoustic model """ super(JuliusAligner, self).__init__(model_dir) self._extensions = ["palign", "walign"] self._outext = self._extensions[0] self._name = "julius"
# ------------------------------------------------------------------------
[docs] def set_outext(self, ext): """Set the extension for output files. :param ext: (str) Extension for output file name. """ ext = ext.lower() if ext not in self._extensions: raise ValueError("{:s} is not a valid file extension for " "JuliusAligner".format(ext)) self._outext = ext
# -----------------------------------------------------------------------
[docs] def gen_slm_dependencies(self, basename, N=3): """Generate the dependencies (slm, dictionary) for julius. :param basename: (str) base name of the slm and dictionary files :param N: (int) Language model N-gram length. """ dict_name = basename + ".dict" slm_name = basename + ".arpa" phoneslist = self._phones.split() tokenslist = self._tokens.split() dictpron = sppasDictPron() for token, pron in zip(tokenslist, phoneslist): for variant in pron.split("|"): dictpron.add_pron(token, variant.replace("-", " ")) if dictpron.is_unk(START_SENT_SYMBOL) is True: dictpron.add_pron(START_SENT_SYMBOL, SIL_PHON) if dictpron.is_unk(END_SENT_SYMBOL) is True: dictpron.add_pron(END_SENT_SYMBOL, SIL_PHON) dictpron.save_as_ascii(dict_name, False) # Write the SLM model = sppasNgramsModel(N) model.append_sentences([self._tokens]) probas = model.probabilities(method="logml") arpaio = sppasArpaIO() arpaio.set(probas) arpaio.save(slm_name)
# ------------------------------------------------------------------------
    def gen_grammar_dependencies(self, basename):
        """Generate the dependencies (grammar, dictionary) for julius.

        Writes two files: `basename.dict` (pronunciation dictionary) and
        `basename.dfa` (a linear Julius grammar in DFA format forcing the
        tokens to be recognized in order).

        :param basename: (str) base name of the grammar and dictionary files

        """
        dict_name = basename + ".dict"
        grammar_name = basename + ".dfa"

        # self._phones / self._tokens are parallel whitespace-separated
        # strings: one pronunciation entry per token.
        phoneslist = self._phones.split()
        tokenslist = self._tokens.split()

        token_idx = 0
        # NOTE(review): the DFA states appear to be numbered in reverse
        # (nb_tokens counts down while token_idx counts up) — presumably the
        # numbering scheme Julius expects for its .dfa files; confirm against
        # the Julius grammar documentation.
        nb_tokens = len(tokenslist)-1

        with codecs.open(grammar_name, 'w', sg.__encoding__) as fdfa,\
                codecs.open(dict_name, 'w', sg.__encoding__) as fdict:

            for token, pron in zip(tokenslist, phoneslist):

                # dictionary: one line per pronunciation variant
                # ("|" separates variants, "-" separates phones in a variant)
                for variant in pron.split("|"):
                    fdict.write(str(token_idx))
                    fdict.write(" ["+token+"] ")
                    fdict.write(variant.replace("-", " ")+"\n")

                # grammar: one transition per token
                if token_idx == 0:
                    fdfa.write("0 {:d} 1 0 1\n".format(nb_tokens))
                else:
                    fdfa.write(str(token_idx) + " " + str(nb_tokens) +
                               " " + str(token_idx+1) + " 0 0\n")

                token_idx += 1
                nb_tokens -= 1

            # last line of the grammar: the final (accepting) state
            fdfa.write("{:d} -1 -1 1 0\n".format(token_idx))
# ------------------------------------------------------------------------
[docs] def run_julius(self, inputwav, basename, outputalign): """Perform the speech segmentation. System call to the command `julius`. Given audio file must match the ones we used to train the acoustic model: PCM-WAV 16000 Hz, 16 bits :param inputwav: (str) audio input file name :param basename: (str) base name of grammar and dictionary files :param outputalign: (str) output file name """ if self._model is None: raise IOError('Julius aligner requires an acoustic model') # Fix file names tiedlist = os.path.join(self._model, "tiedlist") config = os.path.join(self._model, "config") # Fix file names and protect special characters. hmmdefs = '"' + \ os.path.join(self._model, "hmmdefs").replace('"', '\\"') + \ '"' output = '"' + outputalign.replace('"', '\\"') + '"' dictionary = '"' + basename.replace('"', '\\"') + ".dict" + '"' grammar = '"' + basename.replace('"', '\\"') + ".dfa" + '"' slm = '"' + basename.replace('"', '\\"') + ".arpa" + '"' # the command command = "echo " + inputwav + " | julius " # the global decoding parameters command += " -input file -gprune safe -iwcd1 max -smpFreq 16000" command += ' -multipath -iwsppenalty -70.0 -spmodel "sp"' command += " -b 1000 -b2 1000 -sb 1000.0 -m 10000 " # 1. the acoustic model command += " -h " + hmmdefs if os.path.isfile(tiedlist): command += " -hlist " + '"' + tiedlist.replace('"', '\\"') + '"' if os.path.isfile(config): # force Julius to use configuration file of HTK, by David Yeung command += " -htkconf " + '"' + config.replace('"', '\\"') + '"' # 2. the pronunciation dictionary command += " -v " + dictionary # 3. 
the language model if self._outext == "palign": # grammar-based forced-alignment command += " -looktrellis " command += " -palign" command += " -dfa " + grammar else: # slm-based speech recognition command += " -silhead " + '"' + START_SENT_SYMBOL + '"' command += " -siltail " + '"' + END_SENT_SYMBOL + '"' command += " -walign " command += " -nlr " + slm # options # if self._infersp is True: # inter-word short pause = on (append "sp" for each word tail) # command += ' -iwsp' # output of the command command += " > " + output # Execute the command p = Popen(command, shell=True, stdout=PIPE, stderr=STDOUT) p.wait() line = p.communicate() try: msg = u(" ").join([u(l.strip()) for l in line if l is not None]) if msg.startswith("b'"): msg = msg[2:-1] msg = msg.replace("enter filename->", "") msg = msg.replace("1 files processed", "") msg = msg.replace("..", "") msg = msg.replace("\\n", "") if msg.endswith("."): msg = msg[:-1] except UnicodeDecodeError: logging.error("An error occurred. See the logs.") raise UnicodeDecodeError("An error occurred.") # Julius not installed if u("not found") in msg: logging.error('julius returned the following message:') logging.error(line) raise OSError("julius command not found. " "See installation instructions for details.") # Bad command if u("-help") in msg: logging.error('julius returned the following message:') logging.error(line) raise OSError("Run of the julius command failed: {:s}".format(msg)) # Check output file if os.path.isfile(outputalign) is False: logging.error('Run of the julius command returned the following message:') logging.error(line) raise Exception("julius did not created an alignment file.") if len(msg) > 0: logging.info('Run of the julius command returned the following message:') logging.info(msg)
# -----------------------------------------------------------------------
[docs] def run_alignment(self, input_wav, output_align, N=3): """Execute the external program `julius` to align. The data related to the unit to time-align need to be previously fixed with: - set_phones(str) - set_tokens(str) Given audio file must match the ones we used to train the acoustic model: PCM-WAV 16000 Hz, 16 bits :param input_wav: (str) the audio input file name :param output_align: (str) the output file name :param N: (int) for N-grams, used only if SLM (i.e. outext=walign) :returns: (str) A message of `julius`. """ output_align = output_align + "." + self._outext basename = os.path.splitext(input_wav)[0] if self._outext == "palign": self.gen_grammar_dependencies(basename) else: self.gen_slm_dependencies(basename) self.run_julius(input_wav, basename, output_align) lines = BaseAlignersReader.get_lines(output_align) error_lines = "" message = "" entries = [] for line in lines: if line.startswith("Error: voca_load_htkdict") and \ "content" not in line and \ "not found" not in line and \ "missing" not in line: line = sppasUnicode(line).to_strip() columns = line.split(":") if len(columns) >= 3: tie = columns[2].split()[0] entries.append(tie) if len(entries) > 0: message = "SPPAS will try to add the following {:d} triphones in the acoustic model: \n{:s}\n".format(len(entries), "\n".join(entries)) added = self.add_tiedlist(entries) if len(added) == len(entries): message += "The acoustic model was modified. All the missing " \ "entries were successfully added in the model: " \ "{:s}.\nSPPAS calls Julius alignment system for a 2nd time." \ "\n".format(",".join(added)) self.run_julius(input_wav, basename, output_align) with codecs.open(output_align, 'r', sg.__encoding__) as f: lines = f.readlines() f.close() elif len(added) > 0: message += "The acoustic model was modified. " \ "The following entries were successfully added in the acoustic model: {:s}.\n" \ "However not all missing entries were added. " \ "Alignment can't be performed by 'Julius' aligner." 
\ "\n".format(added) else: message += "None of the entries were added in the acoustic model. " \ "Alignment can't be performed by 'Julius' aligner." \ "\n".format(added) for line in lines: line = line.strip() if line.lower().startswith("error:") and " line " not in line and line.endswith(".forward") is False: message += "Julius failed to align the transcription with the audio file.\n" error_lines += line if len(error_lines) > 0: raise Exception(message + error_lines) return message