# Source code for annotations.StopWords.stpwds
"""
:filename: sppas.src.annotations.StopWords.stpwds.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: Stopwords detection.
.. _This file is part of SPPAS: http://www.sppas.org/
..
-------------------------------------------------------------------------
        ___   __    __    __    ___
       /     |  \  |  \  |  \  /              the automatic
       \__   |__/  |__/  |___| \__             annotation and
          \  |     |     |   |    \             analysis
       ___/  |     |     |   | ___/              of speech
Copyright (C) 2011-2021 Brigitte Bigi
Laboratoire Parole et Langage, Aix-en-Provence, France
Use of this software is governed by the GNU Public License, version 3.
SPPAS is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SPPAS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SPPAS. If not, see <http://www.gnu.org/licenses/>.
This banner notice must not be removed.
-------------------------------------------------------------------------
"""
from sppas.src.config import symbols
from sppas.src.config import IndexRangeException
from sppas.src.resources import sppasVocabulary
from sppas.src.resources import sppasUnigram
from ..annotationsexc import EmptyInputError
from ..annotationsexc import TooSmallInputError
# -----------------------------------------------------------------------
class StopWords(sppasVocabulary):
    """A vocabulary that can automatically evaluate a list of Stop-Words.

    An entry 'w' is relevant for the speaker if its probability is less than
    a threshold:

        | P(w) <= 1 / (alpha * V)

    where 'alpha' is an empirical coefficient and 'V' is the vocabulary
    size of the speaker. Entries that are not relevant, i.e. whose
    probability is strictly greater than the threshold, are the ones
    that evaluate() adds into the stop-list.

    """

    # Upper bound (included) accepted by set_alpha().
    MAX_ALPHA = 4.
    # Minimum number of annotations a tier must have to be evaluated.
    MIN_ANN_NUMBER = 5

    def __init__(self, case_sensitive=False):
        """Create a new StopWords instance.

        :param case_sensitive: (bool) Considers the case of entries or not.

        """
        super(StopWords, self).__init__(filename=None,
                                        nodump=True,
                                        case_sensitive=case_sensitive)
        # Member
        self.__alpha = 0.5

        # Estimated values (from a given sppasTier)
        self.__threshold = 0.
        self.__v = 0

    # -----------------------------------------------------------------------
    # Getters and setters
    # -----------------------------------------------------------------------

    def get_alpha(self):
        """Return the value of alpha coefficient (float)."""
        return self.__alpha

    def get_threshold(self):
        """Return the last estimated threshold (float)."""
        return self.__threshold

    def get_v(self):
        """Return the last estimated vocabulary size (int)."""
        return self.__v

    # ------------------------------------------------------------------------

    def set_alpha(self, alpha):
        """Fix the alpha option.

        Alpha is a coefficient to add specific stop-words in the list.
        Default value is 0.5.

        :param alpha: (float) Value in range ]0..4]: 0 is excluded because
            alpha is the divisor of the threshold, MAX_ALPHA is included.
        :raises: IndexRangeException if alpha is out of range;
            float() raises ValueError or TypeError if alpha can't be
            converted.

        """
        alpha = float(alpha)
        if 0. < alpha <= self.MAX_ALPHA:
            self.__alpha = alpha
        else:
            # Report the same bound as the one that was actually checked.
            raise IndexRangeException(alpha, 0, self.MAX_ALPHA)

    # -----------------------------------------------------------------------

    alpha = property(get_alpha, set_alpha)

    # -----------------------------------------------------------------------
    # Data management
    # -----------------------------------------------------------------------

    def copy(self):
        """Make a deep copy of the instance.

        The entries, the alpha coefficient and the last estimated values
        (threshold and vocabulary size) are copied.

        :returns: (StopWords)

        """
        # NOTE(review): the copy is created with the default case_sensitive
        # value; the case sensitivity of this instance is held by the parent
        # class and is not reachable from here -- confirm whether it should
        # be propagated.
        s = StopWords()
        for i in self:
            s.add(i)
        s.set_alpha(self.__alpha)
        # Private names are mangled per class, so the members of another
        # StopWords instance can be assigned from here.
        s.__threshold = self.__threshold
        s.__v = self.__v
        return s

    # -----------------------------------------------------------------------

    def load(self, filename, merge=True):
        """Load a list of stop-words from a file.

        :param filename: (str)
        :param merge: (bool) Merge with the existing list (if True) or
            delete the existing list (if False)

        """
        if merge is False:
            self.clear()
        self.load_from_ascii(filename)

    # -----------------------------------------------------------------------

    def evaluate(self, tier=None, merge=True):
        """Add entries to the list of stop-words from the content of a tier.

        Estimate if a token is relevant: if not it adds it in the stop-list.
        The estimated vocabulary size and threshold are stored and can be
        retrieved with get_v() and get_threshold().

        :param tier: (sppasTier) A tier with entries to be analyzed.
        :param merge: (bool) Merge with the existing list (if True) or
            delete the existing list and create a new one (if False)
        :returns: (int) Number of tokens whose probability is over the
            threshold, i.e. the number of calls to add().
        :raises: EmptyInputError if the tier is None, is empty or contains
            nothing else than symbols; TooSmallInputError if the tier has
            less than MIN_ANN_NUMBER annotations.

        """
        # A missing tier has no name to report: it can't be asked for one.
        if tier is None:
            raise EmptyInputError("None")
        if tier.is_empty():
            raise EmptyInputError(tier.get_name())
        if len(tier) < StopWords.MIN_ANN_NUMBER:
            raise TooSmallInputError(tier.get_name())

        # Create the sppasUnigram from the best tag of each label
        # and put data into a sppasUnigram to estimate frequencies
        unigram = sppasUnigram()
        for ann in tier:
            for label in ann.get_labels():
                # get the content of the best tag in 'str' type
                tag = label.get_best()
                # NOTE(review): get_best() presumably returns None for a
                # label without any tag -- such labels are skipped; confirm.
                if tag is None:
                    continue
                content = tag.get_content()
                if content not in symbols.all:
                    unigram.add(content)

        # Without any token, the threshold can't be estimated (it would be
        # a division by zero). Previous estimations and the current list
        # are left unchanged.
        vocab_size = len(unigram)
        if vocab_size == 0:
            raise EmptyInputError(tier.get_name())

        # Fix values for the estimation of the relevance
        self.__v = vocab_size
        self.__threshold = 1. / (self.__alpha * float(self.__v))

        if merge is False:
            self.clear()

        # Estimate if a token is relevant: if not, add it in the stop-list
        usum = float(unigram.get_sum())
        nb = 0
        for token in unigram.get_tokens():
            p_w = float(unigram.get_count(token)) / usum
            if p_w > self.__threshold:
                self.add(token)
                nb += 1

        return nb