@@ -18,7 +18,7 @@ class UnigramTextModel(CountingProbDist):
1818
1919    def  samples (self , n ):
2020        "Return a string of n words, each drawn with self.sample() and joined by single spaces." 
21-         return  ' ' .join ([ self .sample () for  i  in  range (n )] )
21+         return  ' ' .join (self .sample () for  i  in  range (n ))
2222
2323class  NgramTextModel (CountingProbDist ):
2424    """This is a discrete probability distribution over n-tuples of words. 
@@ -93,6 +93,7 @@ def viterbi_segment(text, P):
9393#______________________________________________________________________________ 
9494
9595
96+ # TODO(tmrts): Expose raw index 
9697class  IRSystem :
9798    """A very simple Information Retrieval System, as discussed in Sect. 23.2. 
9899    The constructor s = IRSystem('the a') builds an empty system with two 
@@ -149,8 +150,7 @@ def present(self, results):
149150        "Print one 'score|url | title' line per (score, document index) pair in results." 
150151        for  (score , d ) in  results :
151152            doc  =  self .documents [d ]
152-             print  ("%5.2f|%25s | %s" 
153-                    %  (100  *  score , doc .url , doc .title [:45 ].expandtabs ()))
153+             print  ("{:5.2f}|{:>25} | {}" .format (100  *  score , doc .url , doc .title [:45 ].expandtabs ()))  # 'f' and '>' keep the old %5.2f / %25s layout
154154
155155    def  present_results (self , query_text , n = 10 ):
156156        "Get results for the query and present them." 
@@ -161,7 +161,7 @@ class UnixConsultant(IRSystem):
161161    def  __init__ (self ):
162162        IRSystem .__init__ (self , stopwords = "how do i the a of" )
163163        import  os 
164-         mandir  =  '../data/MAN/' 
164+         mandir  =  '../aima-data/MAN/'  # directory whose .txt files are indexed below
165165        man_files  =  [mandir  +  f  for  f  in  os .listdir (mandir )
166166                     if  f .endswith ('.txt' )]
167167        self .index_collection (man_files )
@@ -194,6 +194,8 @@ def canonicalize(text):
194194## A shift cipher is a rotation of the letters in the alphabet, 
195195## such as the famous rot13, which maps A to N, B to O, etc. 
196196
197+ alphabet  =  'abcdefghijklmnopqrstuvwxyz' 
198+ 
197199#### Encoding 
198200
199201def  shift_encode (plaintext , n ):
@@ -216,9 +218,8 @@ def encode(plaintext, code):
216218    "Encodes text, using a code which is a permutation of the alphabet; other characters pass through unchanged." 
217219    # str.maketrans replaces the Python 2-only string.maketrans, which no longer imports on Python 3. 
218220    trans  =  str .maketrans (alphabet  +  alphabet .upper (), code  +  code .upper ())
219-     return  plaintext .translate (trans )
220221
221- alphabet   =   'abcdefghijklmnopqrstuvwxyz' 
222+     return  plaintext .translate (trans )
222223
223224def  bigrams (text ):
224225    """Return a list of pairs in text (a sequence of letters or words). 
@@ -241,18 +242,22 @@ def __init__(self, training_text):
241242
242243    def  score (self , plaintext ):
243244        "Return a score for text: the product of self.P2[bi] over every bigram of plaintext." 
245+ 
244246        s  =  1.0 
245247        for  bi  in  bigrams (plaintext ):
246248            s  =  s  *  self .P2 [bi ]
249+ 
247250        return  s 
248251
249252    def  decode (self , ciphertext ):
250253        "Return the candidate from all_shifts(ciphertext) that self.score rates highest." 
251-         return  argmax (all_shifts (ciphertext ), self .score )
254+ 
255+         return  max (all_shifts (ciphertext ), key = self .score )  # key=: a bare second positional would be compared as a value
252256
253257def  all_shifts (text ):
254258    "Yield every encoding of text by a shift cipher, one per letter of alphabet (a generator, not a list)." 
255-     return  [shift_encode (text , n ) for  n  in  range (len (alphabet ))]
259+ 
260+     yield  from  (shift_encode (text , i ) for  i , _  in  enumerate (alphabet ))
256261
257262#### Decoding a General Permutation Cipher 
258263
@@ -309,61 +314,7 @@ def goal_test(self, state):
309314
310315#______________________________________________________________________________ 
311316
312- __doc__  +=  """ 
313- ## Create a Unigram text model from the words in the book "Flatland". 
314- >>> flatland = DataFile("EN-text/flatland.txt").read() 
315- >>> wordseq = words(flatland) 
316- >>> P = UnigramTextModel(wordseq) 
317- 
318- ## Now do segmentation, using the text model as a prior. 
319- >>> s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P) 
320- >>> s 
321- ['it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces'] 
322- >>> 1e-30 < p < 1e-20 
323- True 
324- >>> s, p = viterbi_segment('wheninthecourseofhumaneventsitbecomesnecessary', P) 
325- >>> s 
326- ['when', 'in', 'the', 'course', 'of', 'human', 'events', 'it', 'becomes', 'necessary'] 
327- 
328- ## Test the decoding system 
329- >>> shift_encode("This is a secret message.", 17) 
330- 'Kyzj zj r jvtivk dvjjrxv.' 
331- 
332- >>> ring = ShiftDecoder(flatland) 
333- >>> ring.decode('Kyzj zj r jvtivk dvjjrxv.') 
334- 'This is a secret message.' 
335- >>> ring.decode(rot13('Hello, world!')) 
336- 'Hello, world!' 
337- 
338- ## CountingProbDist 
339- ## Add a thousand samples of a roll of a die to D. 
340- >>> D = CountingProbDist() 
341- >>> for i in range(10000): 
342- ...     D.add(random.choice('123456')) 
343- >>> ps = [D[n] for n in '123456'] 
344- >>> 1./7. <= min(ps) <= max(ps) <= 1./5. 
345- True 
346- """ 
347- 
348- __doc__  +=  (""" 
349- ## Compare 1-, 2-, and 3-gram word models of the same text. 
350- >>> flatland = DataFile("EN-text/flatland.txt").read() 
351- >>> wordseq = words(flatland) 
352- >>> P1 = UnigramTextModel(wordseq) 
353- >>> P2 = NgramTextModel(2, wordseq) 
354- >>> P3 = NgramTextModel(3, wordseq) 
355- 
356- ## The most frequent entries in each model 
357- >>> P1.top(10) 
358- [(2081, 'the'), (1479, 'of'), (1021, 'and'), (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), (478, 'that'), (399, 'is'), (348, 'you')] 
359- 
360- >>> P2.top(10) 
361- [(368, ('of', 'the')), (152, ('to', 'the')), (152, ('in', 'the')), (86, ('of', 'a')), (80, ('it', 'is')), (71, ('by', 'the')), (68, ('for', 'the')), (68, ('and', 'the')), (62, ('on', 'the')), (60, ('to', 'be'))] 
362- 
363- >>> P3.top(10) 
364- [(30, ('a', 'straight', 'line')), (19, ('of', 'three', 'dimensions')), (16, ('the', 'sense', 'of')), (13, ('by', 'the', 'sense')), (13, ('as', 'well', 'as')), (12, ('of', 'the', 'circles')), (12, ('of', 'sight', 'recognition')), (11, ('the', 'number', 'of')), (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] 
365- """ )
366- 
317+ # TODO(tmrts): Set RNG seed to test random functions 
367318__doc__  +=  random_tests (""" 
368319## Generate random text from the N-gram models 
369320>>> P1.samples(20) 
@@ -375,66 +326,3 @@ def goal_test(self, state):
375326>>> P3.samples(20) 
376327'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle' 
377328""" )
378- __doc__  +=  """ 
379- 
380- ## Probabilities of some common n-grams 
381- >>> P1['the']               #doctest:+ELLIPSIS 
382- 0.0611... 
383- 
384- >>> P2[('of', 'the')]       #doctest:+ELLIPSIS 
385- 0.0108... 
386- 
387- >>> P3[('', '', 'but')] 
388- 0.0 
389- 
390- >>> P3[('so', 'as', 'to')]  #doctest:+ELLIPSIS 
391- 0.000323... 
392- 
393- ## Distributions given the previous n-1 words 
394- >>> P2.cond_prob['went',].dictionary 
395- {} 
396- >>> P3.cond_prob['in', 'order'].dictionary 
397- {'to': 6} 
398- 
399- 
400- ## Build and test an IR System 
401- >>> uc = UnixConsultant() 
402- >>> uc.present_results("how do I remove a file") 
403- 76.83|       ../data/MAN/rm.txt | RM(1)                          FSF                          RM(1) 
404- 67.83|      ../data/MAN/tar.txt | TAR(1)                                                                  TAR(1) 
405- 67.79|       ../data/MAN/cp.txt | CP(1)                          FSF                          CP(1) 
406- 66.58|      ../data/MAN/zip.txt | ZIP(1L)                                                   ZIP(1L) 
407- 64.58|     ../data/MAN/gzip.txt | GZIP(1)                                                                GZIP(1) 
408- 63.74|     ../data/MAN/pine.txt | pine(1)                                                   pine(1) 
409- 62.95|    ../data/MAN/shred.txt | SHRED(1)                       FSF                       SHRED(1) 
410- 57.46|     ../data/MAN/pico.txt | pico(1)                                                   pico(1) 
411- 43.38|    ../data/MAN/login.txt | LOGIN(1)                   Linux Programmer's Manual                  
412- 41.93|       ../data/MAN/ln.txt | LN(1)                          FSF                          LN(1) 
413- 
414- >>> uc.present_results("how do I delete a file") 
415- 75.47|     ../data/MAN/diff.txt | DIFF(1)                            GNU Tools                           DIFF(1) 
416- 69.12|     ../data/MAN/pine.txt | pine(1)                                                   pine(1) 
417- 63.56|      ../data/MAN/tar.txt | TAR(1)                                                                  TAR(1) 
418- 60.63|      ../data/MAN/zip.txt | ZIP(1L)                                                   ZIP(1L) 
419- 57.46|     ../data/MAN/pico.txt | pico(1)                                                   pico(1) 
420- 51.28|    ../data/MAN/shred.txt | SHRED(1)                       FSF                       SHRED(1) 
421- 26.72|       ../data/MAN/tr.txt | TR(1)                     User Commands                     TR(1) 
422- 
423- >>> uc.present_results("email") 
424- 18.39|     ../data/MAN/pine.txt | pine(1)                                                   pine(1) 
425- 12.01|     ../data/MAN/info.txt | INFO(1)                        FSF                        INFO(1) 
426-  9.89|     ../data/MAN/pico.txt | pico(1)                                                   pico(1) 
427-  8.73|     ../data/MAN/grep.txt | GREP(1)                                                                GREP(1) 
428-  8.07|      ../data/MAN/zip.txt | ZIP(1L)                                                   ZIP(1L) 
429- 
430- >>> uc.present_results("word counts for files") 
431- 112.38|     ../data/MAN/grep.txt | GREP(1)                                                                GREP(1) 
432- 101.84|       ../data/MAN/wc.txt | WC(1)                     User Commands                     WC(1) 
433- 82.46|     ../data/MAN/find.txt | FIND(1L)                                                              FIND(1L) 
434- 74.64|       ../data/MAN/du.txt | DU(1)                          FSF                          DU(1) 
435- 
436- >>> uc.present_results("learn: date") 
437- >>> uc.present_results("2003") 
438- 14.58|     ../data/MAN/pine.txt | pine(1)                                                   pine(1) 
439- 11.62|      ../data/MAN/jar.txt | FASTJAR(1)                            GNU                           FASTJAR(1) 
440- """ 
0 commit comments