Skip to content

Commit b36947c

Browse files
committed
Merge pull request aimacode#88 from tmrts/port_module/text
Port and test text module for python 3.5
2 parents 72fb122 + e68df0b commit b36947c

File tree

2 files changed

+160
-126
lines changed

2 files changed

+160
-126
lines changed

text.py

Lines changed: 14 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ class UnigramTextModel(CountingProbDist):
1818

1919
def samples(self, n):
2020
"Return a string of n words, random according to the model."
21-
return ' '.join([self.sample() for i in range(n)])
21+
return ' '.join(self.sample() for i in range(n))
2222

2323
class NgramTextModel(CountingProbDist):
2424
"""This is a discrete probability distribution over n-tuples of words.
@@ -93,6 +93,7 @@ def viterbi_segment(text, P):
9393
#______________________________________________________________________________
9494

9595

96+
# TODO(tmrts): Expose raw index
9697
class IRSystem:
9798
"""A very simple Information Retrieval System, as discussed in Sect. 23.2.
9899
The constructor s = IRSystem('the a') builds an empty system with two
@@ -149,8 +150,7 @@ def present(self, results):
149150
"Present the results as a list."
150151
for (score, d) in results:
151152
doc = self.documents[d]
152-
print ("%5.2f|%25s | %s"
153-
% (100 * score, doc.url, doc.title[:45].expandtabs()))
153+
print ("{:5.2f}|{:25} | {}".format(100 * score, doc.url, doc.title[:45].expandtabs()))
154154

155155
def present_results(self, query_text, n=10):
156156
"Get results for the query and present them."
@@ -161,7 +161,7 @@ class UnixConsultant(IRSystem):
161161
def __init__(self):
162162
IRSystem.__init__(self, stopwords="how do i the a of")
163163
import os
164-
mandir = '../data/MAN/'
164+
mandir = '../aima-data/MAN/'
165165
man_files = [mandir + f for f in os.listdir(mandir)
166166
if f.endswith('.txt')]
167167
self.index_collection(man_files)
@@ -194,6 +194,8 @@ def canonicalize(text):
194194
## A shift cipher is a rotation of the letters in the alphabet,
195195
## such as the famous rot13, which maps A to N, B to M, etc.
196196

197+
alphabet = 'abcdefghijklmnopqrstuvwxyz'
198+
197199
#### Encoding
198200

199201
def shift_encode(plaintext, n):
@@ -216,9 +218,8 @@ def encode(plaintext, code):
216218
"Encodes text, using a code which is a permutation of the alphabet."
217219
from string import maketrans
218220
trans = maketrans(alphabet + alphabet.upper(), code + code.upper())
219-
return plaintext.translate(trans)
220221

221-
alphabet = 'abcdefghijklmnopqrstuvwxyz'
222+
return plaintext.translate(trans)
222223

223224
def bigrams(text):
224225
"""Return a list of pairs in text (a sequence of letters or words).
@@ -241,18 +242,22 @@ def __init__(self, training_text):
241242

242243
def score(self, plaintext):
243244
"Return a score for text based on how common letters pairs are."
245+
244246
s = 1.0
245247
for bi in bigrams(plaintext):
246248
s = s * self.P2[bi]
249+
247250
return s
248251

249252
def decode(self, ciphertext):
250253
"Return the shift decoding of text with the best score."
251-
return argmax(all_shifts(ciphertext), self.score)
254+
255+
return max(all_shifts(ciphertext), key=self.score)
252256

253257
def all_shifts(text):
254258
"Generate all 26 possible encodings of text by a shift cipher."
255-
return [shift_encode(text, n) for n in range(len(alphabet))]
259+
260+
yield from (shift_encode(text, i) for i, _ in enumerate(alphabet))
256261

257262
#### Decoding a General Permutation Cipher
258263

@@ -309,61 +314,7 @@ def goal_test(self, state):
309314

310315
#______________________________________________________________________________
311316

312-
__doc__ += """
313-
## Create a Unigram text model from the words in the book "Flatland".
314-
>>> flatland = DataFile("EN-text/flatland.txt").read()
315-
>>> wordseq = words(flatland)
316-
>>> P = UnigramTextModel(wordseq)
317-
318-
## Now do segmentation, using the text model as a prior.
319-
>>> s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P)
320-
>>> s
321-
['it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces']
322-
>>> 1e-30 < p < 1e-20
323-
True
324-
>>> s, p = viterbi_segment('wheninthecourseofhumaneventsitbecomesnecessary', P)
325-
>>> s
326-
['when', 'in', 'the', 'course', 'of', 'human', 'events', 'it', 'becomes', 'necessary']
327-
328-
## Test the decoding system
329-
>>> shift_encode("This is a secret message.", 17)
330-
'Kyzj zj r jvtivk dvjjrxv.'
331-
332-
>>> ring = ShiftDecoder(flatland)
333-
>>> ring.decode('Kyzj zj r jvtivk dvjjrxv.')
334-
'This is a secret message.'
335-
>>> ring.decode(rot13('Hello, world!'))
336-
'Hello, world!'
337-
338-
## CountingProbDist
339-
## Add a thousand samples of a roll of a die to D.
340-
>>> D = CountingProbDist()
341-
>>> for i in range(10000):
342-
... D.add(random.choice('123456'))
343-
>>> ps = [D[n] for n in '123456']
344-
>>> 1./7. <= min(ps) <= max(ps) <= 1./5.
345-
True
346-
"""
347-
348-
__doc__ += ("""
349-
## Compare 1-, 2-, and 3-gram word models of the same text.
350-
>>> flatland = DataFile("EN-text/flatland.txt").read()
351-
>>> wordseq = words(flatland)
352-
>>> P1 = UnigramTextModel(wordseq)
353-
>>> P2 = NgramTextModel(2, wordseq)
354-
>>> P3 = NgramTextModel(3, wordseq)
355-
356-
## The most frequent entries in each model
357-
>>> P1.top(10)
358-
[(2081, 'the'), (1479, 'of'), (1021, 'and'), (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), (478, 'that'), (399, 'is'), (348, 'you')]
359-
360-
>>> P2.top(10)
361-
[(368, ('of', 'the')), (152, ('to', 'the')), (152, ('in', 'the')), (86, ('of', 'a')), (80, ('it', 'is')), (71, ('by', 'the')), (68, ('for', 'the')), (68, ('and', 'the')), (62, ('on', 'the')), (60, ('to', 'be'))]
362-
363-
>>> P3.top(10)
364-
[(30, ('a', 'straight', 'line')), (19, ('of', 'three', 'dimensions')), (16, ('the', 'sense', 'of')), (13, ('by', 'the', 'sense')), (13, ('as', 'well', 'as')), (12, ('of', 'the', 'circles')), (12, ('of', 'sight', 'recognition')), (11, ('the', 'number', 'of')), (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))]
365-
""")
366-
317+
# TODO(tmrts): Set RNG seed to test random functions
367318
__doc__ += random_tests("""
368319
## Generate random text from the N-gram models
369320
>>> P1.samples(20)
@@ -375,66 +326,3 @@ def goal_test(self, state):
375326
>>> P3.samples(20)
376327
'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle'
377328
""")
378-
__doc__ += """
379-
380-
## Probabilities of some common n-grams
381-
>>> P1['the'] #doctest:+ELLIPSIS
382-
0.0611...
383-
384-
>>> P2[('of', 'the')] #doctest:+ELLIPSIS
385-
0.0108...
386-
387-
>>> P3[('', '', 'but')]
388-
0.0
389-
390-
>>> P3[('so', 'as', 'to')] #doctest:+ELLIPSIS
391-
0.000323...
392-
393-
## Distributions given the previous n-1 words
394-
>>> P2.cond_prob['went',].dictionary
395-
{}
396-
>>> P3.cond_prob['in', 'order'].dictionary
397-
{'to': 6}
398-
399-
400-
## Build and test an IR System
401-
>>> uc = UnixConsultant()
402-
>>> uc.present_results("how do I remove a file")
403-
76.83| ../data/MAN/rm.txt | RM(1) FSF RM(1)
404-
67.83| ../data/MAN/tar.txt | TAR(1) TAR(1)
405-
67.79| ../data/MAN/cp.txt | CP(1) FSF CP(1)
406-
66.58| ../data/MAN/zip.txt | ZIP(1L) ZIP(1L)
407-
64.58| ../data/MAN/gzip.txt | GZIP(1) GZIP(1)
408-
63.74| ../data/MAN/pine.txt | pine(1) pine(1)
409-
62.95| ../data/MAN/shred.txt | SHRED(1) FSF SHRED(1)
410-
57.46| ../data/MAN/pico.txt | pico(1) pico(1)
411-
43.38| ../data/MAN/login.txt | LOGIN(1) Linux Programmer's Manual
412-
41.93| ../data/MAN/ln.txt | LN(1) FSF LN(1)
413-
414-
>>> uc.present_results("how do I delete a file")
415-
75.47| ../data/MAN/diff.txt | DIFF(1) GNU Tools DIFF(1)
416-
69.12| ../data/MAN/pine.txt | pine(1) pine(1)
417-
63.56| ../data/MAN/tar.txt | TAR(1) TAR(1)
418-
60.63| ../data/MAN/zip.txt | ZIP(1L) ZIP(1L)
419-
57.46| ../data/MAN/pico.txt | pico(1) pico(1)
420-
51.28| ../data/MAN/shred.txt | SHRED(1) FSF SHRED(1)
421-
26.72| ../data/MAN/tr.txt | TR(1) User Commands TR(1)
422-
423-
>>> uc.present_results("email")
424-
18.39| ../data/MAN/pine.txt | pine(1) pine(1)
425-
12.01| ../data/MAN/info.txt | INFO(1) FSF INFO(1)
426-
9.89| ../data/MAN/pico.txt | pico(1) pico(1)
427-
8.73| ../data/MAN/grep.txt | GREP(1) GREP(1)
428-
8.07| ../data/MAN/zip.txt | ZIP(1L) ZIP(1L)
429-
430-
>>> uc.present_results("word counts for files")
431-
112.38| ../data/MAN/grep.txt | GREP(1) GREP(1)
432-
101.84| ../data/MAN/wc.txt | WC(1) User Commands WC(1)
433-
82.46| ../data/MAN/find.txt | FIND(1L) FIND(1L)
434-
74.64| ../data/MAN/du.txt | DU(1) FSF DU(1)
435-
436-
>>> uc.present_results("learn: date")
437-
>>> uc.present_results("2003")
438-
14.58| ../data/MAN/pine.txt | pine(1) pine(1)
439-
11.62| ../data/MAN/jar.txt | FASTJAR(1) GNU FASTJAR(1)
440-
"""

text_test.py

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
import pytest
2+
3+
from text import *
4+
5+
from random import choice
6+
from math import isclose
7+
8+
def test_unigram_text_model():
9+
flatland = DataFile("aima-data/EN-text/flatland.txt").read()
10+
wordseq = words(flatland)
11+
P = UnigramTextModel(wordseq)
12+
13+
s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P)
14+
15+
assert s == ['it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces']
16+
17+
def test_shift_encoding():
18+
code = shift_encode("This is a secret message.", 17)
19+
20+
assert code == 'Kyzj zj r jvtivk dvjjrxv.'
21+
22+
def test_shift_decoding():
23+
code = shift_encode("This is a secret message.", 17)
24+
25+
flatland = DataFile("aima-data/EN-text/flatland.txt").read()
ring = ShiftDecoder(flatland)
26+
msg = ring.decode('Kyzj zj r jvtivk dvjjrxv.')
27+
28+
assert msg == 'This is a secret message.'
29+
30+
def test_rot13_decoding():
31+
flatland = DataFile("aima-data/EN-text/flatland.txt").read()
ring = ShiftDecoder(flatland)
msg = ring.decode(rot13('Hello, world!'))
32+
33+
assert msg == 'Hello, world!'
34+
35+
def test_counting_probability_distribution():
36+
D = CountingProbDist()
37+
38+
for i in range(10000):
39+
D.add(choice('123456'))
40+
41+
ps = [D[n] for n in '123456']
42+
43+
assert 1/7 <= min(ps) <= max(ps) <= 1/5
44+
45+
def test_ngram_models():
46+
flatland = DataFile("aima-data/EN-text/flatland.txt").read()
47+
wordseq = words(flatland)
48+
P1 = UnigramTextModel(wordseq)
49+
P2 = NgramTextModel(2, wordseq)
50+
P3 = NgramTextModel(3, wordseq)
51+
52+
## The most frequent entries in each model
53+
assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), (1008, 'to'), (850, 'a'),
54+
(722, 'i'), (640, 'in'), (478, 'that'), (399, 'is'), (348, 'you')]
55+
56+
assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), (152, ('in', 'the')), (86, ('of', 'a')),
57+
(80, ('it', 'is' )), (71, ('by', 'the' )), (68, ('for', 'the' )),
58+
(68, ('and', 'the' )), (62, ('on', 'the' )), (60, ('to', 'be'))]
59+
60+
assert P3.top(10) == [(30, ('a', 'straight', 'line')), (19, ('of', 'three', 'dimensions')),
61+
(16, ('the', 'sense', 'of' )), (13, ('by', 'the', 'sense' )),
62+
(13, ('as', 'well', 'as' )), (12, ('of', 'the', 'circles' )),
63+
(12, ('of', 'sight', 'recognition' )), (11, ('the', 'number', 'of' )),
64+
(11, ('that', 'i', 'had' )), (11, ('so', 'as', 'to'))]
65+
66+
67+
assert isclose(P1['the'], 0.0611, rel_tol=0.001)
68+
69+
assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.001)
70+
71+
assert isclose(P3['', '', 'but'], 0.0)
72+
assert isclose(P3['', '', 'but'], 0.0)
73+
assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001)
74+
75+
assert not P2.cond_prob['went',].dictionary
76+
77+
assert P3.cond_prob['in','order'].dictionary == {'to': 6}
78+
79+
def test_ir_system():
80+
from collections import namedtuple
81+
Results = namedtuple('IRResults', ['score', 'url'])
82+
83+
uc = UnixConsultant()
84+
85+
def verify_query(query, expected):
86+
assert len(expected) == len(query)
87+
88+
for expected, (score, d) in zip(expected, query):
89+
doc = uc.documents[d]
90+
91+
assert expected.score == score * 100
92+
assert expected.url == doc.url

    return True
93+
94+
q1 = uc.query("how do I remove a file")
95+
assert verify_query(q1, [
96+
Results(76.83, "../aima-data/MAN/rm.txt"),
97+
Results(67.83, "../aima-data/MAN/tar.txt"),
98+
Results(67.79, "../aima-data/MAN/cp.txt"),
99+
Results(66.58, "../aima-data/MAN/zip.txt"),
100+
Results(64.58, "../aima-data/MAN/gzip.txt"),
101+
Results(63.74, "../aima-data/MAN/pine.txt"),
102+
Results(62.95, "../aima-data/MAN/shred.txt"),
103+
Results(57.46, "../aima-data/MAN/pico.txt"),
104+
Results(43.38, "../aima-data/MAN/login.txt"),
105+
Results(41.93, "../aima-data/MAN/ln.txt"),
106+
])
107+
108+
q2 = uc.query("how do I delete a file")
109+
assert verify_query(q2, [
110+
Results(75.47, "../aima-data/MAN/diff.txt"),
111+
Results(69.12, "../aima-data/MAN/pine.txt"),
112+
Results(63.56, "../aima-data/MAN/tar.txt"),
113+
Results(60.63, "../aima-data/MAN/zip.txt"),
114+
Results(57.46, "../aima-data/MAN/pico.txt"),
115+
Results(51.28, "../aima-data/MAN/shred.txt"),
116+
Results(26.72, "../aima-data/MAN/tr.txt"),
117+
])
118+
119+
q3 = uc.query("email")
120+
assert verify_query(q3, [
121+
Results(18.39, "../aima-data/MAN/pine.txt"),
122+
Results(12.01, "../aima-data/MAN/info.txt"),
123+
Results(9.89, "../aima-data/MAN/pico.txt"),
124+
Results(8.73, "../aima-data/MAN/grep.txt"),
125+
Results(8.07, "../aima-data/MAN/zip.txt"),
126+
])
127+
128+
q4 = uc.query("word counts for files")
129+
assert verify_query(q4, [
130+
Results(112.38, "../aima-data/MAN/grep.txt"),
131+
Results(101.84, "../aima-data/MAN/wc.txt"),
132+
Results(82.46, "../aima-data/MAN/find.txt"),
133+
Results(74.64, "../aima-data/MAN/du.txt"),
134+
])
135+
136+
q5 = uc.query("learn: date")
137+
assert verify_query(q5, [])
138+
139+
q6 = uc.query("2003")
140+
assert verify_query(q6, [
141+
Results(14.58, "../aima-data/MAN/pine.txt"),
142+
Results(11.62, "../aima-data/MAN/jar.txt"),
143+
])
144+
145+
if __name__ == '__main__':
146+
pytest.main()

0 commit comments

Comments
 (0)