Source code for annotations.Align.models.acm.acfeatures

"""
:filename: sppas.src.annotations.Align.models.acm.acfeatures.py
:author:   Brigitte Bigi
:contact:  develop@sppas.org
:summary:  Features of acoustic models.

.. _This file is part of SPPAS: http://www.sppas.org/
..
    -------------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    -------------------------------------------------------------------------

"""

import logging
import os

# ---------------------------------------------------------------------------

# Audio speech file formats which can be read by the HWAVE module.
# Keys are the format names, values are human-readable descriptions.
SourceFormat = {
    "HTK": "The standard HTK file format",
    "TIMIT": "As used in the original prototype TIMIT CD-ROM",
    "NIST": "The standard SPHERE format used by the US NIST",
    "SCRIBE": "Subset of the European SAM standard used in the SCRIBE CD-ROM",
    "SDES1": "The Sound Designer 1 format defined by Digidesign Inc.",
    "AIFF": "Audio interchange file format",
    "SUNAU8": "Subset of 8bit .au and .snd formats used by Sun and NeXT",
    "OGI": "Format used by Oregon Graduate Institute similar to TIMIT",
    "WAVE": "Microsoft WAVE files used on PCs",
    "ESIG": "Entropic Esignal file format",
    "AUDIO": "Pseudo format to indicate direct audio input",
    "ALIEN": "Pseudo format to indicate unsupported file, the alien header "
             "size must be set via the environment variable HDSIZE",
    "NOHEAD": "As for the ALIEN format but header size is zero"
}

# Basic parameter kinds supported by the HPARM module.
# Keys are the kind names, values are human-readable descriptions.
SourceKind = {
    "WAVEFORM": "scalar samples (usually raw speech data)",
    "LPC": "linear prediction coefficients",
    "LPREFC": "linear prediction reflection coefficients",
    "LPCEPSTRA": "LP derived cepstral coefficients",
    "LPDELCEP": "LP cepstra + delta coef (obsolete)",
    "IREFC": "LPREFC stored as 16bit (short) integers",
    "MFCC": "mel-frequency cepstral coefficients",
    "FBANK": "log filter-bank parameters",
    "MELSPEC": "linear filter-bank parameters",
    "USER": "user defined parameters",
    "DISCRETE": "vector quantised codebook symbols",
    "ANON": "matches actual parameter kind"
}

# Available qualifiers for parameter kinds.
# The first 6 of these are used to describe the target kind.
# The source kind may already have some of these,
# HPARM adds the rest as needed.
# NOTE(review): the entries below are sorted alphabetically, so "the first 6"
# presumably refers to the ordering of the HTK documentation and not to the
# order of this dictionary -- to be confirmed.
ParameterKind = {
    "_A": "Acceleration coefficients appended",
    "_C": "External form is compressed",
    "_D": "Delta coefficients appended",
    "_E": "Log energy appended",
    "_K": "External form has checksum appended",
    "_N": "Absolute log energy suppressed",
    "_V": "VQ index appended",
    "_Z": "Cepstral mean subtracted",
    "_0": "Cepstral C0 coefficient appended"
}

# all of the configuration parameters along with their meaning and
# default values
# See: http://www.ee.columbia.edu/ln/rosa/doc/HTKBook21/node78.html
# LINEIN     T     Select line input for audio
# MICIN     F     Select microphone input for audio
# LINEOUT     T     Select line output for audio
# SPEAKEROUT     F     Select speaker output for audio
# PHONESOUT     T     Select headphones output for audio
# SOURCEKIND     ANON     Parameter kind of source
# SOURCEFORMAT     HTK     File format of source
# SOURCERATE     0.0     Sample period of source in 100ns units
# NSAMPLES         Num samples in alien file input via a pipe
# HEADERSIZE         Size of header in an alien file
# STEREOMODE         Select channel: RIGHT or LEFT
# BYTEORDER         Define byte order VAX or other
# NATURALREADORDER     F     Enable natural read order for HTK files
# NATURALWRITEORDER     F     Enable natural write order for HTK files
# TARGETKIND     ANON     Parameter kind of target
# TARGETFORMAT     HTK     File format of target
# TARGETRATE     0.0     Sample period of target in 100ns units
# SAVECOMPRESSED     F     Save the output file in compressed form
# SAVEWITHCRC     T     Attach a checksum to output parameter file
# ADDDITHER     0.0     Level of noise added to input signal
# ZMEANSOURCE     F     Zero mean source waveform before analysis
# WINDOWSIZE     256000.0     Analysis window size in 100ns units
# USEHAMMING     T     Use a Hamming window
# PREEMCOEF     0.97     Set pre-emphasis coefficient
# LPCORDER     12     Order of LPC analysis
# NUMCHANS     20     Number of filterbank channels
# LOFREQ     -1.0     Low frequency cut-off in fbank analysis
# HIFREQ     -1.0     High frequency cut-off in fbank analysis
# USEPOWER     F     Use power not magnitude in fbank analysis
# NUMCEPS     12     Number of cepstral parameters
# CEPLIFTER     22     Cepstral liftering coefficient
# ENORMALISE     T     Normalise log energy
# ESCALE     0.1     Scale log energy
# SILFLOOR     50.0     Energy silence floor (dB)
# DELTAWINDOW     2     Delta window size
# ACCWINDOW     2     Acceleration window size
# VQTABLE     NULL     Name of VQ table
# SAVEASVQ     F     Save only the VQ indices
# AUDIOSIG     0     Audio signal number for remote control
# USESILDET     F     Enable speech/silence detector
# MEASURESIL     T     Measure background noise level prior to sampling
# OUTSILWARN     T     Print a warning message to stdout before measuring audio levels
# SPEECHTHRESH     9.0     Threshold for speech above silence level (dB)
# SILENERGY     0.0     Average background noise level (dB)
# SPCSEQCOUNT     10     Window over which speech/silence decision reached
# SPCGLCHCOUNT     0     Maximum number of frames marked as silence in window which is classified as speech whilst expecting start of speech
# SILSEQCOUNT     100     Number of frames classified as silence needed to mark end of utterance
# SILGLCHCOUNT     2     Maximum number of frames marked as silence in window which is classified as speech whilst expecting silence
# SILMARGIN     40     Number of extra frames included before and after start and end of speech marks from the speech/silence detector
# V1COMPAT     F     Set Version 1.5 compatibility mode
# TRACE     0     Trace setting

# ---------------------------------------------------------------------------


class sppasAcFeatures(object):
    """Acoustic model features.

    Store the settings of the cepstral analysis and write them into
    configuration files: one for HVite ("config") and one for HCopy
    ("mfc_config").

    """

    def __init__(self):
        """Create a sppasAcFeatures instance."""
        # Either WAV or anything else!
        self.sourcekind = "MFC"
        # The window length of the cepstral analysis in milliseconds
        self.win_length_ms = 25
        # The window shift of the cepstral analysis in milliseconds
        self.win_shift_ms = 10
        # Number of filterbank channels
        self.num_chans = 26
        # Length of cepstral liftering
        self.num_lift_ceps = 22
        # The number of cepstral coefficients
        self.num_ceps = 12
        # The coefficient used for the pre-emphasis
        self.pre_em_coef = 0.97
        # Target kind written by write_mfcconfig(). Alternative: "MFCC_0_E"
        self.targetkindw = "MFCC_0_D"
        # Target kind written by write_config(). Alternative: "MFCC_E_D_A_Z"
        self.targetkind = "MFCC_0_D_N_Z"
        # The number of means and variances. It's commonly either 25 or 39.
        self.nbmv = 25
        # The file for HVite: assigned by write_config()
        self.configfile = ""
        # The file for HCopy: assigned by write_mfcconfig()
        self.mfcconfigfile = ""
        # Hz
        self.framerate = 16000
        # 16 bits
        self.sampwidth = 2

    # -----------------------------------------------------------------------

    def write_all(self, dirname):
        """Write all files at once.

        Write files with their default name, in the given directory:
        "mfc_config" then "config".

        :param dirname: (str) a directory name (existing or to be created).
        :raises: OSError if the directory can't be created, or if a file
        can't be written.

        """
        # makedirs() also creates the missing parent directories, which
        # mkdir() does not.
        if not os.path.isdir(dirname):
            os.makedirs(dirname)

        self.write_mfcconfig(os.path.join(dirname, "mfc_config"))
        self.write_config(os.path.join(dirname, "config"))

    # -----------------------------------------------------------------------

    def write_config(self, filename):
        """Write the wav config into a file.

        The description of the source is written only if the source kind
        is "WAV". The name of the file is then stored into the 'configfile'
        member.

        :param filename: (str) Name of the file to save the features.

        """
        logging.info('Write wav config file: %s ', filename)

        with open(filename, "w") as fp:
            if self.sourcekind == "WAV":
                self._write_source_settings(fp)
            self._write_target_settings(fp, self.targetkind)

        self.configfile = filename

    # -----------------------------------------------------------------------

    def write_mfcconfig(self, filename):
        """Write the wav config into a file. For HCopy only.

        The description of the source is always written. The name of the
        file is then stored into the 'mfcconfigfile' member.

        :param filename: (str) Name of the file to save the features.

        """
        logging.info('Write wav config file: %s ', filename)

        with open(filename, "w") as fp:
            self._write_source_settings(fp)
            self._write_target_settings(fp, self.targetkindw)

        self.mfcconfigfile = filename

    # -----------------------------------------------------------------------
    # Private
    # -----------------------------------------------------------------------

    def _write_source_settings(self, fp):
        """Write the SOURCE* lines describing a wav input.

        :param fp: a file object opened for writing text.

        """
        fp.write("SOURCEFORMAT = WAV\n")
        fp.write("SOURCEKIND = WAVEFORM\n")
        # Sample period deduced from the framerate, in 100ns units.
        # NOTE(review): "%d" truncates the value when the period is not an
        # integer (e.g. with a framerate of 44100). Kept as is so that the
        # content of the written files is unchanged.
        fp.write("SOURCERATE = %d\n" % ((1000./float(self.framerate))*10000))

    # -----------------------------------------------------------------------

    def _write_target_settings(self, fp, targetkind):
        """Write the target and analysis lines shared by both config files.

        :param fp: a file object opened for writing text.
        :param targetkind: (str) Value of the TARGETKIND entry.

        """
        fp.write("TARGETFORMAT = HTK\n")
        fp.write("TARGETKIND = %s\n" % targetkind)
        # Durations are converted from milliseconds to 100ns units
        fp.write("TARGETRATE = %.1f\n" % (self.win_shift_ms*10000))
        fp.write("SAVECOMPRESSED = T\n")
        fp.write("SAVEWITHCRC = T\n")
        fp.write("WINDOWSIZE = %.1f\n" % (self.win_length_ms*10000))
        fp.write("USEHAMMING = T\n")
        fp.write("PREEMCOEF = %f\n" % self.pre_em_coef)
        fp.write("NUMCHANS = %d\n" % self.num_chans)
        fp.write("CEPLIFTER = %d\n" % self.num_lift_ceps)
        fp.write("NUMCEPS = %d\n" % self.num_ceps)
        fp.write("ENORMALISE = F\n")