Source code for annotations.Align.models.slm.arpaio

"""
:filename: sppas.src.annotations.Align.models.slm.arpaio.py
:author:   Brigitte Bigi
:contact:  develop@sppas.org
:summary:  I/O for ARPA models.

.. _This file is part of SPPAS: http://www.sppas.org/
..
    -------------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    -------------------------------------------------------------------------

"""

import codecs

from sppas.src.config import sg

from ..modelsexc import ModelsDataTypeError
from ..modelsexc import ArpaFileError

# ---------------------------------------------------------------------------


class sppasArpaIO(object):
    """ARPA statistical language models reader/writer.

    This class is able to load and save statistical language models from
    ARPA-ASCII files.

    The model is stored as a list with one item per n-gram order (1-grams
    first). Each item is a list of tuples (tokenseq, proba, bow) in which
    bow is None when the entry has no back-off weight.

    """

    def __init__(self):
        """Create a sppasArpaIO instance without model."""
        self.__slm = None

    # -----------------------------------------------------------------------

    def set(self, slm):
        """Set the model of the sppasSLM.

        :param slm: (list) List of lists of tuples: one list for the
        1-grams, one for the 2-grams, ...
        :raises: ModelsDataTypeError if slm is not a list of lists

        """
        # Only the two container levels are checked; the content of each
        # tuple is used as it is by the serializers.
        if not (isinstance(slm, list) and
                all(isinstance(m, list) for m in slm)):
            raise ModelsDataTypeError("slm",
                                      "list of lists of tuples",
                                      type(slm))
        self.__slm = slm

    # -----------------------------------------------------------------------

    def load(self, filename):
        """Load a model from an ARPA file.

        The model of this instance is replaced only if the whole file was
        parsed: if an exception is raised, the previous model is kept.

        :param filename: (str) Name of the file of the model.
        :returns: (list) The model, i.e. a list of lists of tuples
        (tokenseq, proba, bow)
        :raises: ArpaFileError if an entry has not enough columns
        :raises: ValueError if a probability or a bow is not a number

        """
        # we expect small models, so we can read the whole file in one!
        with codecs.open(filename, 'r', sg.__encoding__) as f:
            lines = f.readlines()

        self.__slm = sppasArpaIO._parse_lines(lines)
        return self.__slm

    # -----------------------------------------------------------------------

    def save(self, filename):
        r"""Save the model into a file, in ARPA-ASCII format.

        Nothing is written if no model was set nor loaded.

        The ARPA format:

             \data\
             ngram 1=nb1
             ngram 2=nb2
             . . .
             ngram N=nbN

             \1-grams:
             p(a_z)  a_z  bow(a_z)
             . . .

             \2-grams:
             p(a_z)  a_z  bow(a_z)
             . . .

             \n-grams:
             p(a_z)  a_z
             . . .

             \end\

        :param filename: (str) File where to save the model.

        """
        if self.__slm is not None:
            with codecs.open(filename, 'w', sg.__encoding__) as f:
                f.write(self._serialize_slm())

    # -----------------------------------------------------------------------
    # Private
    # -----------------------------------------------------------------------

    @staticmethod
    def _parse_lines(lines):
        r"""Parse the lines of an ARPA file.

        Lines before the first n-grams section (the \data\ header and the
        "ngram N=nb" counts) are ignored. The order of a section is its
        rank among the section headers: the number written in the header
        itself is not read.

        :param lines: (iterable of str) Lines of an ARPA file
        :returns: (list) List of lists of tuples (tokenseq, proba, bow)
        :raises: ArpaFileError if an entry has not enough columns
        :raises: ValueError if a probability or a bow is not a number

        """
        slm = list()
        order = 0
        ngrams = list()

        for line in lines:
            line = line.strip()
            if len(line) == 0:
                continue

            # the footer stops the parsing: what follows is ignored
            if line.startswith('\\end'):
                break

            if line.startswith('\\') and "data" not in line:
                # a new "\N-grams:" section: store the previous one
                if order > 0:
                    slm.append(ngrams)
                order += 1
                ngrams = list()

            elif order > 0:
                # split line into columns
                cols = line.split()
                if len(cols) < order + 1:
                    raise ArpaFileError(line)
                # probability is the first column
                proba = float(cols[0])
                # the n- following columns are the ngram
                tokenseq = " ".join(cols[1:order+1])
                # the last (optional) value is the bow
                bow = None
                if len(cols) > order + 1:
                    bow = float(cols[-1])
                ngrams.append((tokenseq, proba, bow))

        # store the section which was in progress when the parsing stopped
        if order > 0:
            slm.append(ngrams)

        return slm

    # -----------------------------------------------------------------------

    def _serialize_slm(self):
        """Serialize a model into a string, in ARPA-ASCII format.

        :returns: The ARPA-ASCII model as a string.

        """
        parts = [self._serialize_header()]
        parts.extend(sppasArpaIO._serialize_ngram(m, order)
                     for order, m in enumerate(self.__slm, 1))
        parts.append(sppasArpaIO._serialize_footer())
        return "".join(parts)

    # -----------------------------------------------------------------------

    def _serialize_header(self):
        r"""Serialize the header of an ARPA file.

            \data\
            ngram 1=nb1
            ngram 2=nb2
            ...
            ngram N=nbN

        :returns: (str) The header, followed by an empty line

        """
        lines = ["\\data\\ "]
        lines.extend("ngram " + str(order) + "=" + str(len(m))
                     for order, m in enumerate(self.__slm, 1))
        # the empty item produces the blank line closing the header
        lines.append("")
        return "\n".join(lines) + "\n"

    # -----------------------------------------------------------------------

    @staticmethod
    def _serialize_ngram(model, order):
        r"""Serialize one of the ngrams of an ARPA file.

            \2-grams:
            p(a_z)  a_z  bow(a_z)
            ...

        Probabilities and bows are rounded to 6 digits. The bow column is
        omitted when the bow is None.

        :param model: (list) List of tuples (tokenseq, proba, bow)
        :param order: (int) Order of the ngrams, written in the section title
        :returns: (str) The section, followed by an empty line

        """
        lines = ["\\" + str(order) + "-grams: "]
        for (wseq, lp, bo) in model:
            entry = str(round(lp, 6)) + "\t" + wseq
            if bo is not None:
                entry += "\t" + str(round(bo, 6))
            lines.append(entry)
        # the empty item produces the blank line closing the section
        lines.append("")
        return "\n".join(lines) + "\n"

    # -----------------------------------------------------------------------

    @staticmethod
    def _serialize_footer():
        r"""Serialize the footer of an ARPA file.

            \end\

        :returns: (str) The end-of-model marker of the ARPA format

        """
        # The ARPA end marker is closed by a backslash, like "\data\".
        return "\\end\\\n"