# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.calculus.stats.frequency.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: A collection of basic frequency functions for python.
.. _This file is part of SPPAS: http://www.sppas.org/
..
-------------------------------------------------------------------------
___ __ __ __ ___
/ | \ | \ | \ / the automatic
\__ |__/ |__/ |___| \__ annotation and
\ | | | | \ analysis
___/ | | | | ___/ of speech
Copyright (C) 2011-2021 Brigitte Bigi
Laboratoire Parole et Langage, Aix-en-Provence, France
Use of this software is governed by the GNU Public License, version 3.
SPPAS is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
SPPAS is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with SPPAS. If not, see <http://www.gnu.org/licenses/>.
This banner notice must not be removed.
-------------------------------------------------------------------------
"""
import math
from ..calculusexc import EmptyError, ProbabilityError
# ---------------------------------------------------------------------------
def freq(mylist, item):
    """Return the relative frequency of an item of a list.

    The relative frequency is the number of occurrences of the item
    divided by the total number of elements of the list. An empty list
    holds no occurrence of any item, so 0. is returned in that case
    instead of failing on a division by zero.

    >>> freq(["a", "b", "a", "c"], "a")
    0.5

    :param mylist: (list) list of elements
    :param item: (any) an element of the list (or not!)
    :returns: frequency (float) of item in mylist; 0. if mylist is empty

    """
    total = len(mylist)
    if total == 0:
        return 0.
    return float(mylist.count(item)) / float(total)
# ---------------------------------------------------------------------------
def percent(mylist, item):
    """Return the share of a list taken by an item, as a percentage.

    This is the relative frequency given by freq() multiplied by 100.

    :param mylist: (list) list of elements
    :param item: (any) the element to look for; it may be missing
    :returns: percentage (float) of item in mylist

    """
    ratio = freq(mylist, item)
    return ratio * 100.0
# ---------------------------------------------------------------------------
def percentile(mylist, p=(25, 50, 75), sort=True):
    """Return the pth percentile of an unsorted or sorted numeric list.

    This is equivalent to calling quantile(mylist, p/100.0).

    >>> round(percentile([15, 20, 40, 35, 50], 40), 2)
    26.0
    >>> for perc in percentile([15, 20, 40, 35, 50], (0, 25, 50, 75, 100)):
    ...     print("{:.2f}".format(perc))
    ...
    15.00
    17.50
    35.00
    45.00
    50.00

    :param mylist: (list) list of elements.
    :param p: (number or iterable of numbers) the percentile(s) we are
        looking for, each one expressed in the range 0-100.
    :param sort: whether to sort the vector.
    :returns: a single percentile if p is a number, or a list of
        percentiles (in the order of p) if p is iterable.

    """
    if hasattr(p, "__iter__"):
        # Hand over a real list, not a generator expression: quantile()
        # goes through its probabilities once to validate them and once
        # to compute the values, so a one-shot generator could be
        # exhausted before any value is computed.
        return quantile(mylist, [x / 100.0 for x in p], sort)
    return quantile(mylist, p / 100.0, sort)
# ---------------------------------------------------------------------------
def quantile(mylist, q=(0.25, 0.5, 0.75), sort=True):
    """Return the qth quantile of an unsorted or sorted numeric list.

    Calculates a rank n as q(N+1), where N is the number of items in mylist,
    then splits n into its integer component k and decimal component d.
    If k < 1, returns the first element;
    if k >= N, returns the last element;
    otherwise returns the linear interpolation between
    mylist[k-1] and mylist[k] using a factor d.

    >>> round(quantile([15, 20, 40, 35, 50], 0.4), 2)
    26.0

    :param mylist: (list) list of elements.
    :param q: (number or iterable of numbers) the quantile(s) we are
        looking for, each one in the range [0, 1]. Any iterable is
        accepted, including a one-shot generator.
    :param sort: whether to sort the vector. Any truthy value enables
        the sort.
    :returns: a single quantile if q is a number, or a list of quantiles
        (in the order of q) if q is iterable. The first/last element is
        returned unchanged when the rank falls outside the list.
    :raises: EmptyError if mylist is empty
    :raises: ProbabilityError if a quantile is not in the range [0, 1]

    """
    if len(mylist) == 0:
        raise EmptyError
    if sort:
        mylist = sorted(mylist)

    return_single = not hasattr(q, "__iter__")
    # Materialize the probabilities: they are read twice below, which
    # would silently yield no result with an already-consumed iterator.
    qs = [q] if return_single else list(q)

    # Validate everything first so that nothing is computed on bad input.
    for p in qs:
        if p < 0. or p > 1.:
            raise ProbabilityError(p)

    size = len(mylist)
    result = list()
    for p in qs:
        n = float(p) * (size + 1)
        k = int(n)
        d = n - k
        if k >= size:
            result.append(mylist[-1])
        elif k < 1:
            result.append(mylist[0])
        else:
            result.append((1 - d) * mylist[k - 1] + d * mylist[k])

    if return_single:
        return result[0]
    return result
# ---------------------------------------------------------------------------
# NLP functions related to frequency
# ---------------------------------------------------------------------------
def hapax(mydict):
    """Return the list of hapax, i.e. the entries seen exactly once.

    :param mydict: (dict) a dictionary with key=entry, value=occurrences
    :returns: list of the keys of mydict for which the value is 1

    """
    return [entry for entry, count in mydict.items() if count == 1]
# ---------------------------------------------------------------------------
def occranks(mydict):
    """Return a dictionary with key=occurrence, value=rank.

    The distinct values of mydict are ordered from the highest to the
    lowest one; the highest gets the rank 1, the next one the rank 2, etc.
    Entries sharing the same value therefore share the same rank.

    >>> occranks({"a": 3, "b": 1, "c": 3, "d": 2})
    {3: 1, 2: 2, 1: 3}

    :param mydict: (dict) a dictionary whose values are sortable and
        hashable, like number of occurrences
    :returns: dict with key=a value of mydict, value=its rank (from 1)

    """
    # Only the distinct values matter for the ranking, not how many
    # entries share each of them.
    distinct = set(mydict.values())
    return {
        occ: rank
        for rank, occ in enumerate(sorted(distinct, reverse=True), start=1)
    }
# ---------------------------------------------------------------------------
def ranks(counter):
    """Return a dictionary with key=token, value=rank.

    The rank of a token is the one that occranks() assigns to its number
    of occurrences.

    :param counter: (collections.Counter) key=token, value=occurrences
    :returns: dict

    """
    rank_of_occ = occranks(counter)
    return {token: rank_of_occ[counter[token]] for token in counter.keys()}
# ---------------------------------------------------------------------------
def zipf(dict_ranks, item):
    """Return the Zipf Law value of an item.

    Zipf's law states that given some corpus of natural language utterances,
    the frequency of any word is inversely proportional to its rank in the
    frequency table. Thus the most frequent word will occur approximately
    twice as often as the second most frequent word, three times as often
    as the third most frequent word, etc.

    The returned value is 0.1 divided by the rank of the item.

    :param dict_ranks: (dict) is a dictionary with key=entry, value=rank.
    :param item: (any) is an entry of the ranks dictionary
    :returns: Zipf value or -1 if the entry is missing

    """
    # Guard clause: an unknown entry has no rank to divide by.
    if item not in dict_ranks:
        return -1
    rank = dict_ranks[item]
    return 0.1 / rank
# ---------------------------------------------------------------------------
def tfidf(documents, item):
    """Return the tf.idf of an item.

    Term frequency–inverse document frequency, is a numerical statistic
    that is intended to reflect how important a word is to a document in a
    collection or corpus. The tf.idf value increases proportionally to the
    number of times a word appears in the document, but is offset by the
    frequency of the word in the corpus, which helps to control for the fact
    that some words are generally more common than others.

    Here, the term frequency is estimated on the whole corpus, i.e. on all
    the documents put together, and the result is tf * log(D / dw) where D
    is the number of documents and dw the number of documents in which the
    item occurs.

    :param documents: a list of list of entries.
    :param item: (any) the entry to evaluate
    :returns: (float) the tf.idf, or 0. if the item is in none of the
        documents (including when there is no document or no entry at all)

    """
    # number of documents in the corpus
    nb_docs = len(documents)

    # number of documents with at least one occurrence of item;
    # kept as a float so that the ratio below is never an integer division
    dw = float(sum(1 for d in documents if item in d))

    # Checked before estimating tf: without any occurrence the result is 0.
    # and, with an empty corpus, there is no token to estimate tf from.
    if dw == 0.:
        return 0.

    # Estimate tf of item in the corpus
    alltokens = []
    for d in documents:
        alltokens.extend(d)
    tf = freq(alltokens, item)

    return tf * math.log(nb_docs / dw)