Source code for calculus.infotheory.entropy
"""
:filename: sppas.src.calculus.infotheory.entropy.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: Entropy estimator.
.. _This file is part of SPPAS: http://www.sppas.org/
..
-------------------------------------------------------------------------
___ __ __ __ ___
/ | \ | \ | \ / the automatic
\__ |__/ |__/ |___| \__ annotation and
\ | | | | \ analysis
___/ | | | | ___/ of speech
Copyright (C) 2011-2021 Brigitte Bigi
Laboratoire Parole et Langage, Aix-en-Provence, France
Use of this software is governed by the GNU Public License, version 3.
SPPAS is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SPPAS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SPPAS. If not, see <http://www.gnu.org/licenses/>.
This banner notice must not be removed.
-------------------------------------------------------------------------
"""
from ..calculusexc import EmptyError, InsideIntervalError
from .utilit import log2
from .utilit import MAX_NGRAM
from .utilit import symbols_to_items
# ----------------------------------------------------------------------------
class sppasEntropy(object):
    """Entropy estimation.

    Entropy is a measure of the unpredictability of information content.
    It is one of several ways to measure diversity.

    To study the entropy of a long series, the entropy can also be
    estimated on successive windows, in order to measure the evenness or
    the uncertainty over time. From the definition, windows with a high
    variance are expected to produce a higher entropy, and windows with a
    low variance a lower one.
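
    :Example: (illustrative; the expected values follow from eval() below)

        >>> e = sppasEntropy(list("1223334444"), n=1)
        >>> round(e.eval(), 5)
        1.84644
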
"""

    def __init__(self, symbols, n=1):
        """Create a sppasEntropy instance with a list of symbols.

        :param symbols: (list) a vector of symbols of any type
        :param n: (int) n value for n-gram estimation. n ranges 1..MAX_NGRAM

        """
        self._symbols = list()
        self._ngram = 1

        self.set_symbols(symbols)
        self.set_ngram(n)

    # -----------------------------------------------------------------------

    def set_symbols(self, symbols):
        """Set the list of symbols.

        :param symbols: (list) a vector of symbols of any type

        """
        if len(symbols) == 0:
            raise EmptyError

        self._symbols = symbols

    # -----------------------------------------------------------------------

    def set_ngram(self, n):
        """Set the n value of n-grams.

        :param n: (int) n value for n-gram estimation. n ranges 1..MAX_NGRAM

        """
        n = int(n)
        if 0 < n <= MAX_NGRAM:
            self._ngram = n
        else:
            raise InsideIntervalError(n, 1, MAX_NGRAM)

    # -----------------------------------------------------------------------

    def eval(self):
        """Estimate the Shannon entropy of a vector of symbols.

        Shannon's entropy measures the information contained in a message,
        as opposed to the portion of the message that is determined (or
        predictable). It is estimated here as the sum, over the observed
        n-grams, of p * log2(1/p), where p is the relative frequency of
        the n-gram in the sequence.

        :returns: (float) entropy value
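
        :Example: (illustrative; with the default n=1)

            >>> sppasEntropy([1, 1, 1, 1]).eval()
            0.0
            >>> sppasEntropy([0, 1, 0, 1]).eval()
            1.0
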
"""
        if len(self._symbols) == 0:
            raise EmptyError

        # occurrences of each n-gram observed in the sequence
        exr = symbols_to_items(self._symbols, self._ngram)
        # total number of n-grams in the sequence
        total = len(self._symbols) - self._ngram + 1
        result = 0.

        for symbol, occurrences in exr.items():
            probability = 1.0 * occurrences / total
            self_information = log2(1.0 / probability)
            result += probability * self_information

        return result
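
# ----------------------------------------------------------------------------
# Illustrative usage sketch, not part of the original module: it estimates
# the entropy of successive windows of a sequence, as suggested in the class
# docstring. The sequence and the window size of 4 are arbitrary choices.

if __name__ == "__main__":
    data = [0, 0, 0, 0, 1, 1, 0, 1, 0, 1]
    size = 4
    for start in range(len(data) - size + 1):
        window = sppasEntropy(data[start:start + size], n=1)
        print("window at {:d}: {:.3f}".format(start, window.eval()))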