@@ -142,6 +142,57 @@ def parse_csv(input, delim=','):
142142
143143#______________________________________________________________________________
144144
class CountingProbDist:
    """A probability distribution formed by observing and counting examples.

    If p is an instance of this class and o is an observed value, then
    there are 3 main operations:
    p.add(o) increments the count for observation o by 1.
    p.sample() returns a random element from the distribution.
    p[o] returns the probability for o (as in a regular ProbDist).
    """

    def __init__(self, observations=(), default=0):
        """Create a distribution, and optionally add in some observations.

        By default this is an unsmoothed distribution, but saying default=1,
        for example, gives you add-one smoothing.

        observations defaults to an immutable empty tuple (not a mutable [])
        so the default object can never be shared or mutated across calls.
        """
        # Set attributes directly instead of going through an external
        # update(...) helper: self-contained, and avoids passing a mutable
        # {} literal around.
        self.dictionary = {}   # observation -> (smoothed) count
        self.n_obs = 0.0       # float so __getitem__'s division is exact under Py2 too
        self.default = default
        self.sampler = None    # built lazily by sample(); invalidated on mutation
        for o in observations:
            self.add(o)

    def add(self, o):
        """Add an observation o to the distribution."""
        self.smooth_for(o)
        self.dictionary[o] += 1
        self.n_obs += 1
        self.sampler = None  # counts changed; any cached sampler is stale

    def smooth_for(self, o):
        """Include o among the possible observations, whether or not
        it's been observed yet."""
        if o not in self.dictionary:
            self.dictionary[o] = self.default
            self.n_obs += self.default
            self.sampler = None

    def __getitem__(self, item):
        """Return an estimate of the probability of item."""
        self.smooth_for(item)
        return self.dictionary[item] / self.n_obs

    # (top() and sample() are not used in this module, but elsewhere.)

    def top(self, n):
        """Return (count, obs) tuples for the n most frequent observations."""
        return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])

    def sample(self):
        """Return a random sample from the distribution."""
        if self.sampler is None:
            # list(...) because Py3 dict views don't support indexing;
            # weighted_sampler is expected to come from the utils module.
            self.sampler = weighted_sampler(list(self.dictionary.keys()),
                                            list(self.dictionary.values()))
        return self.sampler()
193+
194+ #______________________________________________________________________________
195+
145196def PluralityLearner (dataset ):
146197 """A very dumb algorithm: always pick the result that was most popular
147198 in the training data. Makes a baseline for comparison."""
def NaiveBayesLearner(dataset):
    """Just count how many times each value of each input attribute
    occurs, conditional on the target value. Count the different
    target values too."""
    target_values = dataset.values[dataset.target]
    # Prior over target values, plus one conditional distribution per
    # (target value, input attribute) pair.  Each distribution is seeded
    # with every possible value, so no value ever has a zero count.
    target_dist = CountingProbDist(target_values)
    attr_dists = {(gv, attr): CountingProbDist(dataset.values[attr])
                  for gv in target_values
                  for attr in dataset.inputs}
    # Tally every training example into the relevant distributions.
    for example in dataset.examples:
        gv = example[dataset.target]
        target_dist.add(gv)
        for attr in dataset.inputs:
            attr_dists[gv, attr].add(example[attr])

    def predict(example):
        """Predict the target value for example. Consider each possible value,
        and pick the most likely by looking at each attribute independently."""
        def class_probability(targetval):
            likelihood = 1
            for attr in dataset.inputs:
                likelihood *= attr_dists[targetval, attr][example[attr]]
            return target_dist[targetval] * likelihood
        return argmax(target_values, class_probability)

    return predict
201233
0 commit comments