Source code for calculus.stats.linregress

# -*- coding: UTF-8 -*-
"""
:filename: sppas.src.calculus.stats.linregress.py
:author: Brigitte Bigi
:contact: develop@sppas.org
:summary: Linear regression functions for python.

.. _This file is part of SPPAS: http://www.sppas.org/
..
    -------------------------------------------------------------------------

     ___   __    __    __    ___
    /     |  \  |  \  |  \  /              the automatic
    \__   |__/  |__/  |___| \__             annotation and
       \  |     |     |   |    \             analysis
    ___/  |     |     |   | ___/              of speech

    Copyright (C) 2011-2021  Brigitte Bigi
    Laboratoire Parole et Langage, Aix-en-Provence, France

    Use of this software is governed by the GNU Public License, version 3.

    SPPAS is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    SPPAS is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with SPPAS. If not, see <http://www.gnu.org/licenses/>.

    This banner notice must not be removed.

    -------------------------------------------------------------------------

The goal of linear regression is to fit a line to a set of points.
Equation of the line is y = mx + b
where m is slope, b is y-intercept.

"""

from .central import fmean
from .central import fsum

# ---------------------------------------------------------------------------


[docs]def compute_error_for_line_given_points(b, m, points): """Error function (also called a cost function). It measures how "good" a given line is. This function will take in a (m,b) pair and return an error value based on how well the line fits our data. To compute this error for a given line, we'll iterate through each (x,y) point in our data set and sum the square distances between each point's y value and the candidate line's y value (computed at mx + b). Lines that fit our data better (where better is defined by our error function) will result in lower error values. """ total_error = 0 for x, y in points: total_error += (y - (m * x + b)) ** 2 return total_error / float(len(points))
# ---------------------------------------------------------------------------
[docs]def step_gradient(b_current, m_current, points, learning_rate): """One step of a gradient linear regression. To run gradient descent on an error function, we first need to compute its gradient. The gradient will act like a compass and always point us downhill. To compute it, we will need to differentiate our error function. Since our function is defined by two parameters (m and b), we will need to compute a partial derivative for each. Each iteration will update m and b to a line that yields slightly lower error than the previous iteration. The learning_rate variable controls how large of a step we take downhill during each iteration. If we take too large of a step, we may step over the minimum. However, if we take small steps, it will require many iterations to arrive at the minimum. """ b_gradient = 0 m_gradient = 0 n = float(len(points)) for x, y in points: b_gradient += -(2./n) * (y - ((m_current * x) + b_current)) m_gradient += -(2./n) * x * (y - ((m_current * x) + b_current)) new_b = b_current - (learning_rate * b_gradient) new_m = m_current - (learning_rate * m_gradient) return [new_b, new_m]
# ---------------------------------------------------------------------------
[docs]def gradient_descent(points, starting_b, starting_m, learning_rate, num_iterations): """Gradient descent is an algorithm that minimizes functions. Given a function defined by a set of parameters, gradient descent starts with an initial set of parameter values and iteratively moves toward a set of parameter values that minimize the function. This iterative minimization is achieved using calculus, taking steps in the negative direction of the function gradient. :param points: a list of tuples (x,y) of float values. :param starting_b: (float) :param starting_m: (float) :param learning_rate: (float) :param num_iterations: (int) :returns: intercept, slope """ if len(points) == 0: return 0. b = starting_b m = starting_m for i in range(num_iterations): b, m = step_gradient(b, m, points, learning_rate) return b, m
# ---------------------------------------------------------------------------
[docs]def gradient_descent_linear_regression(points, num_iterations=50000): """Gradient descent method for linear regression. adapted from: http://spin.atomicobject.com/2014/06/24/gradient-descent-linear-regression/ :param points: a list of tuples (x,y) of float values. :param num_iterations: (int) :returns: intercept, slope """ g = gradient_descent(points, starting_b=0., # initial y-intercept guess starting_m=0., # initial slope guess learning_rate=0.0001, num_iterations=num_iterations) return g
# ---------------------------------------------------------------------------
[docs]def tga_linear_regression(points): """Linear regression as proposed in TGA, by Dafydd Gibbon. http://wwwhomes.uni-bielefeld.de/gibbon/TGA/ :param points: a list of tuples (x,y) of float values. :returns: intercept, slope """ if len(points) == 0: return 0. # Fix means mean_x = fmean([x for x, y in points]) mean_y = fmean([y for x, y in points]) xy_sum = 0. xsq_sum = 0. for x, y in points: dx = x - mean_x dy = y - mean_y xy_sum += (dx*dy) xsq_sum += (dx*dx) # Intercept m = xy_sum if xsq_sum != 0: m = xy_sum / xsq_sum # Slope b = mean_y - m * mean_x return b, m
# ---------------------------------------------------------------------------
[docs]def tansey_linear_regression(points): """Linear regression, as proposed in AnnotationPro. http://annotationpro.org/ Translated from C# code from here: https://gist.github.com/tansey/1375526 :param points: a list of tuples (x,y) of float values. :returns: intercept, slope """ if len(points) == 0: return 0. sum_x_sq = 0. sum_codeviates = 0. n = len(points) for x, y in points: sum_codeviates += (x*y) sum_x_sq += (x*x) sum_x = fsum([x for x, y in points]) sum_y = fsum([y for x, y in points]) mean_x = fmean([x for x, y in points]) mean_y = fmean([y for x, y in points]) ssx = sum_x_sq - ((sum_x*sum_x) / n) sco = sum_codeviates - ((sum_x * sum_y) / n) b = mean_y - ((sco / ssx) * mean_x) m = sco / ssx return b, m