@@ -18,7 +18,7 @@ class UnigramTextModel(CountingProbDist):
1818
1919 def samples (self , n ):
2020 "Return a string of n words, random according to the model."
21- return ' ' .join ([ self .sample () for i in range (n )] )
21+ return ' ' .join (self .sample () for i in range (n ))
2222
2323class NgramTextModel (CountingProbDist ):
2424 """This is a discrete probability distribution over n-tuples of words.
@@ -93,6 +93,7 @@ def viterbi_segment(text, P):
9393#______________________________________________________________________________
9494
9595
96+ # TODO(tmrts): Expose raw index
9697class IRSystem :
9798 """A very simple Information Retrieval System, as discussed in Sect. 23.2.
9899 The constructor s = IRSystem('the a') builds an empty system with two
@@ -149,8 +150,7 @@ def present(self, results):
149150 "Present the results as a list."
150151 for (score , d ) in results :
151152 doc = self .documents [d ]
152- print ("%5.2f|%25s | %s"
153- % (100 * score , doc .url , doc .title [:45 ].expandtabs ()))
153+ print ("{:5.2}|{:25} | {}" .format (100 * score , doc .url , doc .title [:45 ].expandtabs ()))
154154
155155 def present_results (self , query_text , n = 10 ):
156156 "Get results for the query and present them."
@@ -161,7 +161,7 @@ class UnixConsultant(IRSystem):
161161 def __init__ (self ):
162162 IRSystem .__init__ (self , stopwords = "how do i the a of" )
163163 import os
164- mandir = '../data/MAN/'
164+ mandir = '../aima- data/MAN/'
165165 man_files = [mandir + f for f in os .listdir (mandir )
166166 if f .endswith ('.txt' )]
167167 self .index_collection (man_files )
@@ -194,6 +194,8 @@ def canonicalize(text):
194194## A shift cipher is a rotation of the letters in the alphabet,
195195## such as the famous rot13, which maps A to N, B to M, etc.
196196
197+ alphabet = 'abcdefghijklmnopqrstuvwxyz'
198+
197199#### Encoding
198200
199201def shift_encode (plaintext , n ):
@@ -216,9 +218,8 @@ def encode(plaintext, code):
216218 "Encodes text, using a code which is a permutation of the alphabet."
217219 from string import maketrans
218220 trans = maketrans (alphabet + alphabet .upper (), code + code .upper ())
219- return plaintext .translate (trans )
220221
221- alphabet = 'abcdefghijklmnopqrstuvwxyz'
222+ return plaintext . translate ( trans )
222223
223224def bigrams (text ):
224225 """Return a list of pairs in text (a sequence of letters or words).
@@ -241,18 +242,22 @@ def __init__(self, training_text):
241242
242243 def score (self , plaintext ):
243244 "Return a score for text based on how common letters pairs are."
245+
244246 s = 1.0
245247 for bi in bigrams (plaintext ):
246248 s = s * self .P2 [bi ]
249+
247250 return s
248251
249252 def decode (self , ciphertext ):
250253 "Return the shift decoding of text with the best score."
251- return argmax (all_shifts (ciphertext ), self .score )
254+
255+ return max (all_shifts (ciphertext ), self .score )
252256
253257def all_shifts (text ):
254258 "Return a list of all 26 possible encodings of text by a shift cipher."
255- return [shift_encode (text , n ) for n in range (len (alphabet ))]
259+
260+ yield from (shift_encode (text , i ) for i , _ in enumerate (alphabet ))
256261
257262#### Decoding a General Permutation Cipher
258263
@@ -309,61 +314,7 @@ def goal_test(self, state):
309314
310315#______________________________________________________________________________
311316
312- __doc__ += """
313- ## Create a Unigram text model from the words in the book "Flatland".
314- >>> flatland = DataFile("EN-text/flatland.txt").read()
315- >>> wordseq = words(flatland)
316- >>> P = UnigramTextModel(wordseq)
317-
318- ## Now do segmentation, using the text model as a prior.
319- >>> s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P)
320- >>> s
321- ['it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces']
322- >>> 1e-30 < p < 1e-20
323- True
324- >>> s, p = viterbi_segment('wheninthecourseofhumaneventsitbecomesnecessary', P)
325- >>> s
326- ['when', 'in', 'the', 'course', 'of', 'human', 'events', 'it', 'becomes', 'necessary']
327-
328- ## Test the decoding system
329- >>> shift_encode("This is a secret message.", 17)
330- 'Kyzj zj r jvtivk dvjjrxv.'
331-
332- >>> ring = ShiftDecoder(flatland)
333- >>> ring.decode('Kyzj zj r jvtivk dvjjrxv.')
334- 'This is a secret message.'
335- >>> ring.decode(rot13('Hello, world!'))
336- 'Hello, world!'
337-
338- ## CountingProbDist
339- ## Add a thousand samples of a roll of a die to D.
340- >>> D = CountingProbDist()
341- >>> for i in range(10000):
342- ... D.add(random.choice('123456'))
343- >>> ps = [D[n] for n in '123456']
344- >>> 1./7. <= min(ps) <= max(ps) <= 1./5.
345- True
346- """
347-
348- __doc__ += ("""
349- ## Compare 1-, 2-, and 3-gram word models of the same text.
350- >>> flatland = DataFile("EN-text/flatland.txt").read()
351- >>> wordseq = words(flatland)
352- >>> P1 = UnigramTextModel(wordseq)
353- >>> P2 = NgramTextModel(2, wordseq)
354- >>> P3 = NgramTextModel(3, wordseq)
355-
356- ## The most frequent entries in each model
357- >>> P1.top(10)
358- [(2081, 'the'), (1479, 'of'), (1021, 'and'), (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), (478, 'that'), (399, 'is'), (348, 'you')]
359-
360- >>> P2.top(10)
361- [(368, ('of', 'the')), (152, ('to', 'the')), (152, ('in', 'the')), (86, ('of', 'a')), (80, ('it', 'is')), (71, ('by', 'the')), (68, ('for', 'the')), (68, ('and', 'the')), (62, ('on', 'the')), (60, ('to', 'be'))]
362-
363- >>> P3.top(10)
364- [(30, ('a', 'straight', 'line')), (19, ('of', 'three', 'dimensions')), (16, ('the', 'sense', 'of')), (13, ('by', 'the', 'sense')), (13, ('as', 'well', 'as')), (12, ('of', 'the', 'circles')), (12, ('of', 'sight', 'recognition')), (11, ('the', 'number', 'of')), (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))]
365- """ )
366-
317+ # TODO(tmrts): Set RNG seed to test random functions
367318__doc__ += random_tests ("""
368319## Generate random text from the N-gram models
369320>>> P1.samples(20)
@@ -375,66 +326,3 @@ def goal_test(self, state):
375326>>> P3.samples(20)
376327'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle'
377328""" )
378- __doc__ += """
379-
380- ## Probabilities of some common n-grams
381- >>> P1['the'] #doctest:+ELLIPSIS
382- 0.0611...
383-
384- >>> P2[('of', 'the')] #doctest:+ELLIPSIS
385- 0.0108...
386-
387- >>> P3[('', '', 'but')]
388- 0.0
389-
390- >>> P3[('so', 'as', 'to')] #doctest:+ELLIPSIS
391- 0.000323...
392-
393- ## Distributions given the previous n-1 words
394- >>> P2.cond_prob['went',].dictionary
395- {}
396- >>> P3.cond_prob['in', 'order'].dictionary
397- {'to': 6}
398-
399-
400- ## Build and test an IR System
401- >>> uc = UnixConsultant()
402- >>> uc.present_results("how do I remove a file")
403- 76.83| ../data/MAN/rm.txt | RM(1) FSF RM(1)
404- 67.83| ../data/MAN/tar.txt | TAR(1) TAR(1)
405- 67.79| ../data/MAN/cp.txt | CP(1) FSF CP(1)
406- 66.58| ../data/MAN/zip.txt | ZIP(1L) ZIP(1L)
407- 64.58| ../data/MAN/gzip.txt | GZIP(1) GZIP(1)
408- 63.74| ../data/MAN/pine.txt | pine(1) pine(1)
409- 62.95| ../data/MAN/shred.txt | SHRED(1) FSF SHRED(1)
410- 57.46| ../data/MAN/pico.txt | pico(1) pico(1)
411- 43.38| ../data/MAN/login.txt | LOGIN(1) Linux Programmer's Manual
412- 41.93| ../data/MAN/ln.txt | LN(1) FSF LN(1)
413-
414- >>> uc.present_results("how do I delete a file")
415- 75.47| ../data/MAN/diff.txt | DIFF(1) GNU Tools DIFF(1)
416- 69.12| ../data/MAN/pine.txt | pine(1) pine(1)
417- 63.56| ../data/MAN/tar.txt | TAR(1) TAR(1)
418- 60.63| ../data/MAN/zip.txt | ZIP(1L) ZIP(1L)
419- 57.46| ../data/MAN/pico.txt | pico(1) pico(1)
420- 51.28| ../data/MAN/shred.txt | SHRED(1) FSF SHRED(1)
421- 26.72| ../data/MAN/tr.txt | TR(1) User Commands TR(1)
422-
423- >>> uc.present_results("email")
424- 18.39| ../data/MAN/pine.txt | pine(1) pine(1)
425- 12.01| ../data/MAN/info.txt | INFO(1) FSF INFO(1)
426- 9.89| ../data/MAN/pico.txt | pico(1) pico(1)
427- 8.73| ../data/MAN/grep.txt | GREP(1) GREP(1)
428- 8.07| ../data/MAN/zip.txt | ZIP(1L) ZIP(1L)
429-
430- >>> uc.present_results("word counts for files")
431- 112.38| ../data/MAN/grep.txt | GREP(1) GREP(1)
432- 101.84| ../data/MAN/wc.txt | WC(1) User Commands WC(1)
433- 82.46| ../data/MAN/find.txt | FIND(1L) FIND(1L)
434- 74.64| ../data/MAN/du.txt | DU(1) FSF DU(1)
435-
436- >>> uc.present_results("learn: date")
437- >>> uc.present_results("2003")
438- 14.58| ../data/MAN/pine.txt | pine(1) pine(1)
439- 11.62| ../data/MAN/jar.txt | FASTJAR(1) GNU FASTJAR(1)
440- """
0 commit comments