probabilistic_learning.py

"""Learning probabilistic models. (Chapters 20)"""

import heapq

from utils import weighted_sampler, product, gaussian


class CountingProbDist:
    """
    A probability distribution formed by observing and counting examples.
    If p is an instance of this class and o is an observed value, then
    there are 3 main operations:
    p.add(o) increments the count for observation o by 1.
    p.sample() returns a random element from the distribution.
    p[o] returns the probability for o (as in a regular ProbDist).
    """

    def __init__(self, observations=None, default=0):
        """
        Create a distribution, and optionally add in some observations.
        By default this is an unsmoothed distribution, but saying default=1,
        for example, gives you add-one smoothing.
        """
        if observations is None:
            observations = []
        self.dictionary = {}
        self.n_obs = 0
        self.default = default
        self.sampler = None

        for o in observations:
            self.add(o)

    def add(self, o):
        """Add an observation o to the distribution."""
        self.smooth_for(o)
        self.dictionary[o] += 1
        self.n_obs += 1
        self.sampler = None

    def smooth_for(self, o):
        """
        Include o among the possible observations, whether or not
        it's been observed yet.
        """
        if o not in self.dictionary:
            self.dictionary[o] = self.default
            self.n_obs += self.default
            self.sampler = None

    def __getitem__(self, item):
        """Return an estimate of the probability of item."""
        self.smooth_for(item)
        return self.dictionary[item] / self.n_obs

    # (top() and sample() are not used in this module, but elsewhere.)

    def top(self, n):
        """Return (count, obs) tuples for the n most frequent observations."""
        return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])

    def sample(self):
        """Return a random sample from the distribution."""
        if self.sampler is None:
            self.sampler = weighted_sampler(list(self.dictionary.keys()), list(self.dictionary.values()))
        return self.sampler()


def NaiveBayesLearner(dataset, continuous=True, simple=False):
    if simple:
        return NaiveBayesSimple(dataset)
    if continuous:
        return NaiveBayesContinuous(dataset)
    else:
        return NaiveBayesDiscrete(dataset)


def NaiveBayesSimple(distribution):
    """
    A simple naive bayes classifier that takes as input a dictionary of
    CountingProbDist objects and classifies items according to these distributions.
    The input dictionary is in the following form:
        (ClassName, ClassProb): CountingProbDist
    """
    target_dist = {c_name: prob for c_name, prob in distribution.keys()}
    attr_dists = {c_name: count_prob for (c_name, _), count_prob in distribution.items()}

    def predict(example):
        """Predict the target value for example. Calculate probabilities for each
        class and pick the max."""

        def class_probability(target_val):
            attr_dist = attr_dists[target_val]
            return target_dist[target_val] * product(attr_dist[a] for a in example)

        return max(target_dist.keys(), key=class_probability)

    return predict


def NaiveBayesDiscrete(dataset):
    """
    Just count how many times each value of each input attribute
    occurs, conditional on the target value. Count the different
    target values too.
    """

    target_vals = dataset.values[dataset.target]
    target_dist = CountingProbDist(target_vals)
    attr_dists = {(gv, attr): CountingProbDist(dataset.values[attr]) for gv in target_vals for attr in dataset.inputs}
    for example in dataset.examples:
        target_val = example[dataset.target]
        target_dist.add(target_val)
        for attr in dataset.inputs:
            attr_dists[target_val, attr].add(example[attr])

    def predict(example):
        """
        Predict the target value for example. Consider each possible value,
        and pick the most likely by looking at each attribute independently.
        """

        def class_probability(target_val):
            return (target_dist[target_val] * product(attr_dists[target_val, attr][example[attr]]
                                                      for attr in dataset.inputs))

        return max(target_vals, key=class_probability)

    return predict


def NaiveBayesContinuous(dataset):
    """
    Count how many times each target value occurs.
    Also, find the means and deviations of input attribute values for each target value.
    """
    means, deviations = dataset.find_means_and_deviations()

    target_vals = dataset.values[dataset.target]
    target_dist = CountingProbDist(target_vals)

    def predict(example):
        """Predict the target value for example. Consider each possible value,
        and pick the most likely by looking at each attribute independently."""

        def class_probability(target_val):
            prob = target_dist[target_val]
            for attr in dataset.inputs:
                prob *= gaussian(means[target_val][attr], deviations[target_val][attr], example[attr])
            return prob

        return max(target_vals, key=class_probability)

    return predict