Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 75 additions & 7 deletions learning.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Learn to estimate functions from examples. (Chapters 18-20)"""

from utils import (
removeall, unique, product, mode, argmax, argmax_random_tie, isclose,
removeall, unique, product, mode, argmax, argmax_random_tie, isclose, gaussian,
dotproduct, vector_add, scalar_vector_product, weighted_sample_with_replacement,
weighted_sampler, num_or_str, normalize, clip, sigmoid, print_table, DataFile
)
Expand All @@ -11,7 +11,7 @@
import math
import random

from statistics import mean
from statistics import mean, stdev
from collections import defaultdict

# ______________________________________________________________________________
Expand Down Expand Up @@ -178,6 +178,45 @@ def remove_examples(self, value=""):
self.examples = [x for x in self.examples if value not in x]
self.update_values()

def split_values_by_classes(self):
    """Split the examples into buckets according to their class.

    Returns a defaultdict that maps each target value found in
    self.examples to a list of items. An item is a copy of an example
    with the entry at position self.target removed; the examples
    themselves are left untouched."""
    buckets = defaultdict(list)

    for example in self.examples:
        item = list(example)
        # Remove the target by position, not by value: filtering on value
        # would also throw away every feature that happens to equal one of
        # the class labels (e.g. 0/1 features together with 0/1 classes).
        label = item.pop(self.target)
        buckets[label].append(item)  # Add item to bucket of its class

    return buckets

def find_means_and_deviations(self):
    """Find the per-class means and standard deviations of the features.

    Returns the pair (means, deviations):
    means     : A dictionary for each class/target. Holds a list of the means
                of the features for the class.
    deviations: A dictionary for each class/target. Holds a list of the sample
                standard deviations of the features for the class.

    Both are defaultdicts, so looking up a class that was never filled in
    yields a list of zeros, one per feature.

    Raises statistics.StatisticsError when a class listed in
    self.values[self.target] has no items (mean) or fewer than two items
    (stdev)."""
    target_names = self.values[self.target]
    feature_numbers = len(self.inputs)

    item_buckets = self.split_values_by_classes()

    means = defaultdict(lambda: [0] * feature_numbers)
    deviations = defaultdict(lambda: [0] * feature_numbers)

    for t in target_names:
        items = item_buckets[t]

        # Calculate means and deviations for the class, one feature at a time.
        # Each column is read straight from the items, so the work is linear
        # in the number of items instead of copying the lists for every item.
        for i in range(feature_numbers):
            column = [item[i] for item in items]
            means[t][i] = mean(column)
            deviations[t][i] = stdev(column)

    return means, deviations


def __repr__(self):
    """Summarise the dataset: its name and how many examples and attributes it has."""
    template = '<DataSet({}): {:d} examples, {:d} attributes>'
    name = self.name
    example_count = len(self.examples)
    attribute_count = len(self.attrs)
    return template.format(name, example_count, attribute_count)
Expand Down Expand Up @@ -267,15 +306,22 @@ def predict(example):
# ______________________________________________________________________________


def NaiveBayesLearner(dataset):
def NaiveBayesLearner(dataset, continuous=True):
    """Build a naive Bayes predictor for dataset.

    When continuous is true (the default) the work is handed to
    NaiveBayesContinuous; otherwise it is handed to NaiveBayesDiscrete."""
    learner = NaiveBayesContinuous if continuous else NaiveBayesDiscrete
    return learner(dataset)


def NaiveBayesDiscrete(dataset):
"""Just count how many times each value of each input attribute
occurs, conditional on the target value. Count the different
target values too."""

targetvals = dataset.values[dataset.target]
target_dist = CountingProbDist(targetvals)
target_vals = dataset.values[dataset.target]
target_dist = CountingProbDist(target_vals)
attr_dists = {(gv, attr): CountingProbDist(dataset.values[attr])
for gv in targetvals
for gv in target_vals
for attr in dataset.inputs}
for example in dataset.examples:
targetval = example[dataset.target]
Expand All @@ -290,7 +336,29 @@ def class_probability(targetval):
return (target_dist[targetval] *
product(attr_dists[targetval, attr][example[attr]]
for attr in dataset.inputs))
return argmax(targetvals, key=class_probability)
return argmax(target_vals, key=class_probability)

return predict


def NaiveBayesContinuous(dataset):
    """Count how many times each target value occurs.
    Also, find the means and deviations of input attribute values for each target value.
    Returns a predict(example) function that picks the most likely target value."""
    means, deviations = dataset.find_means_and_deviations()

    target_vals = dataset.values[dataset.target]
    target_dist = CountingProbDist(target_vals)
    # Actually count the classes: without this every target value has been
    # observed exactly once, so the prior would be uniform whatever the data.
    for example in dataset.examples:
        target_dist.add(example[dataset.target])

    def predict(example):
        """Predict the target value for example. Consider each possible value,
        and pick the most likely by looking at each attribute independently."""
        def class_probability(targetval):
            # Class prior times the Gaussian density of every input attribute.
            prob = target_dist[targetval]
            for attr in dataset.inputs:
                # NOTE(review): means/deviations are indexed by attribute number
                # here, but find_means_and_deviations builds them from examples
                # with the target removed; the two line up only when the target
                # is the last attribute -- confirm against callers.
                prob *= gaussian(means[targetval][attr], deviations[targetval][attr], example[attr])
            return prob

        return argmax(target_vals, key=class_probability)

    return predict

Expand Down
22 changes: 22 additions & 0 deletions tests/test_learning.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,20 @@ def test_weighted_replicate():
assert weighted_replicate('ABC', [1, 2, 1], 4) == ['A', 'B', 'B', 'C']


def test_means_and_deviation():
    # Check the first feature's mean and sample deviation for every iris class.
    iris = DataSet(name="iris")
    means, deviations = iris.find_means_and_deviations()

    expected_means = [("setosa", 5.006), ("versicolor", 5.936), ("virginica", 6.588)]
    expected_deviations = [("setosa", 0.352), ("versicolor", 0.516), ("virginica", 0.636)]

    for species, expected in expected_means:
        assert round(means[species][0], 3) == expected

    for species, expected in expected_deviations:
        assert round(deviations[species][0], 3) == expected


def test_plurality_learner():
zoo = DataSet(name="zoo")

Expand All @@ -48,6 +62,14 @@ def test_naive_bayes():
# Discrete
nBD = NaiveBayesLearner(iris)
assert nBD([5, 3, 1, 0.1]) == "setosa"
assert nBD([6, 5, 3, 1.5]) == "versicolor"
assert nBD([7, 3, 6.5, 2]) == "virginica"

# Continuous
nBC = NaiveBayesLearner(iris, continuous=True)
assert nBC([5, 3, 1, 0.1]) == "setosa"
assert nBC([6, 5, 3, 1.5]) == "versicolor"
assert nBC([7, 3, 6.5, 2]) == "virginica"


def test_k_nearest_neighbors():
Expand Down
6 changes: 6 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,12 @@ def test_sigmoid():
assert isclose(0.2689414213699951, sigmoid(-1))


def test_gaussian():
    # Compare with isclose rather than ==: the last bits of a float result
    # depend on the underlying math library, so exact equality is brittle.
    assert isclose(gaussian(1, 0.5, 0.7), 0.6664492057835993)
    assert isclose(gaussian(5, 2, 4.5), 0.19333405840142462)
    assert isclose(gaussian(3, 1, 3), 0.3989422804014327)


def test_step():
assert step(1) == step(0.5) == 1
assert step(0) == 1
Expand Down
4 changes: 4 additions & 0 deletions utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,6 +258,10 @@ def step(x):
"""Return activation value of x with sign function"""
return 1 if x >= 0 else 0

def gaussian(mean, st_dev, x):
    """Return the value at x of the normal probability density that has the
    given mean and standard deviation (a density, not a probability)."""
    normalizer = 1 / (math.sqrt(2 * math.pi) * st_dev)
    z_score = float(x - mean) / st_dev
    return normalizer * math.e ** (-0.5 * z_score ** 2)


try: # math.isclose was added in Python 3.5; but we might be in 3.4
from math import isclose
Expand Down