Commit

Adds linear_regression().
sheriferson committed Jul 30, 2016
1 parent db98bd4 commit d9da51e
Showing 5 changed files with 89 additions and 1 deletion.
8 changes: 7 additions & 1 deletion README.md
@@ -85,6 +85,12 @@ nosetests --with-doctest
| Standard deviation | `standard_deviation([1, 2, 3])` |
| Standard scores (z-scores) | `z_scores([-2, -1, 0, 1, 2])` |

#### Linear regression

| Function | Example |
|--------------------------|-------------------------------------------------------------|
| Simple linear regression | `linear_regression([1, 2, 3, 4, 5], [4, 4.5, 5.5, 5.3, 6])` |

#### Similarity

| Function | Example |
@@ -103,7 +109,7 @@ nosetests --with-doctest

### Spirit and rules

- Everything should be implemented in raw, organic, locally sourced Python.
- Use libraries only if you have to, and only when they are unrelated to the math/statistics. For example, `from functools import reduce` to make `reduce` available under Python 3 is okay, because it's about making Python work, not about making the stats easier.
- It's okay to use operators and functions if they correspond to regular calculator buttons. For example, all calculators have a built-in square root function, so there is no need to implement that ourselves; we can use `math.sqrt()`.
Anything beyond that, like `mean` or `median`, we have to write ourselves.
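In that spirit, a from-scratch `mean` might look like the following. This is a hypothetical sketch to illustrate the rule, not the library's actual implementation:

```python
def mean(data):
    """Arithmetic mean written from scratch, with no statistics libraries."""
    total = 0
    for value in data:
        total += value
    # true division under Python 3 (or with `from __future__ import division`)
    return total / len(data)

print(mean([1, 2, 3, 4]))  # 2.5
```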
2 changes: 2 additions & 0 deletions changelog.txt
@@ -12,6 +12,8 @@ Date TBD
- `normal()` to calculate normal distribution probabilities
- `kurtosis()` to calculate kurtosis/"tailedness" of a probability distribution of a variable
- `skew()` to calculate Pearson's moment coefficient of skewness
- `linear_regression()` to calculate the slope (m) and y-intercept (b) of the
  line of best fit


### Improved
3 changes: 3 additions & 0 deletions simplestatistics/__init__.py
@@ -21,6 +21,9 @@
from .statistics.correlate import correlate
from .statistics.z_scores import z_scores

# linear regression
from .statistics.linear_regression import linear_regression

# distributions
from .statistics.factorial import factorial
from .statistics.choose import choose
5 changes: 5 additions & 0 deletions simplestatistics/index.rst
@@ -143,6 +143,11 @@ Standard scores (z scores)

.. autofunction:: simplestatistics.z_scores

Linear regression
-----------------

.. autofunction:: simplestatistics.linear_regression

Similarity
----------

72 changes: 72 additions & 0 deletions simplestatistics/statistics/linear_regression.py
@@ -0,0 +1,72 @@
# Decimal is used below for sane division that yields a float-like result
# rather than Python 2's integer division; `from __future__ import division`
# would be an alternative.

from decimal import Decimal

from .mean import mean
from .product import product

def linear_regression(x, y):
    """
    This is a `simple linear regression`_ that finds the line of best fit for
    a set of points. It uses least squares to find the slope (:math:`m`)
    and y-intercept (:math:`b`).

    .. _`simple linear regression`: https://en.wikipedia.org/wiki/Linear_regression

    Equation:
        .. math::
            m = \\frac{\\bar{X}\\bar{Y} - \\bar{XY}}{(\\bar{X})^2 - \\bar{X^2}}

            b = \\bar{Y} - m\\bar{X}

    Where:
        - :math:`m` is the slope.
        - :math:`b` is the y-intercept.

    Returns:
        A tuple of two values: (m, b), where m is the slope and b is the y-intercept.

    Examples:
        >>> linear_regression([1, 2, 3, 4, 5], [4, 4.5, 5.5, 5.3, 6])
        (0.48, 3.62)
        >>> linear_regression([1, 2, 3, 4, 5], [2, 2.9, 3.95, 5.1, 5.9])
        (1.0, 0.97)
        >>> linear_regression((1, 2), (3, 3.5))
        (0.5, 2.5)
        >>> linear_regression([1], [2])
        (None, 2)
        >>> linear_regression(4, 5)
        >>> linear_regression([1, 2], [5])
        Traceback (most recent call last):
            ...
        ValueError: The two variables have to have the same length.
    """

    if type(x) not in [list, tuple] or type(y) not in [list, tuple]:
        return None
    elif len(x) != len(y):
        raise ValueError('The two variables have to have the same length.')
    elif len(x) == 1 or len(y) == 1:
        return (None, y[0])

    mean_x = mean(x)
    mean_y = mean(y)
    mean_xy = mean(product(x, y))

    x2 = [pow(xi, 2) for xi in x]
    mean_x2 = mean(x2)

    # calculate the slope
    numerator = (mean_x * mean_y) - mean_xy
    denominator = pow(mean_x, 2) - mean_x2

    m = Decimal(numerator) / Decimal(denominator)  # slope

    # calculate the y-intercept
    b = Decimal(mean_y) - (m * Decimal(mean_x))

    return (round(m, 2), round(b, 2))
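As a sanity check on the mean-based formula in the docstring, a standalone sketch with plain floats (hypothetical name `fit_line`, independent of the package's helper functions) reproduces the doctest values:

```python
def fit_line(x, y):
    """Least-squares fit via the mean-based formula:
    m = (mean(x) * mean(y) - mean(x * y)) / (mean(x)**2 - mean(x**2))
    b = mean(y) - m * mean(x)
    """
    n = len(x)
    mean_x = sum(x) / n
    mean_y = sum(y) / n
    mean_xy = sum(xi * yi for xi, yi in zip(x, y)) / n
    mean_x2 = sum(xi ** 2 for xi in x) / n
    m = (mean_x * mean_y - mean_xy) / (mean_x ** 2 - mean_x2)
    b = mean_y - m * mean_x
    return (round(m, 2), round(b, 2))

print(fit_line([1, 2, 3, 4, 5], [4, 4.5, 5.5, 5.3, 6]))  # (0.48, 3.62)
```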
