Skip to content

Commit 39db351

Browse files
antmarakis authored and norvig committed
N-gram Text Models (aimacode#573)
* Update text.py
* Update test_text.py
* Update text.ipynb
* 'sentences' to 'samples'
1 parent e5da461 commit 39db351

File tree

3 files changed

+289
-105
lines changed

3 files changed

+289
-105
lines changed

tests/test_text.py

Lines changed: 47 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -6,71 +6,54 @@
66
from utils import isclose, open_data
77

88

9+
910
def test_text_models():
1011
flatland = open_data("EN-text/flatland.txt").read()
1112
wordseq = words(flatland)
1213
P1 = UnigramWordModel(wordseq)
1314
P2 = NgramWordModel(2, wordseq)
1415
P3 = NgramWordModel(3, wordseq)
1516

16-
# The most frequent entries in each model
17-
assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'),
18-
(1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'),
19-
(478, 'that'), (399, 'is'), (348, 'you')]
20-
21-
assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')),
22-
(152, ('in', 'the')), (86, ('of', 'a')),
23-
(80, ('it', 'is')),
24-
(71, ('by', 'the')), (68, ('for', 'the')),
25-
(68, ('and', 'the')), (62, ('on', 'the')),
26-
(60, ('to', 'be'))]
27-
28-
assert P3.top(10) == [(30, ('a', 'straight', 'line')),
29-
(19, ('of', 'three', 'dimensions')),
30-
(16, ('the', 'sense', 'of')),
31-
(13, ('by', 'the', 'sense')),
32-
(13, ('as', 'well', 'as')),
33-
(12, ('of', 'the', 'circles')),
34-
(12, ('of', 'sight', 'recognition')),
35-
(11, ('the', 'number', 'of')),
36-
(11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))]
17+
# Test top
18+
assert P1.top(5) == [(2081, 'the'), (1479, 'of'),
19+
(1021, 'and'), (1008, 'to'),
20+
(850, 'a')]
3721

38-
assert isclose(P1['the'], 0.0611, rel_tol=0.001)
22+
assert P2.top(5) == [(368, ('of', 'the')), (152, ('to', 'the')),
23+
(152, ('in', 'the')), (86, ('of', 'a')),
24+
(80, ('it', 'is'))]
3925

40-
assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01)
26+
assert P3.top(5) == [(30, ('a', 'straight', 'line')),
27+
(19, ('of', 'three', 'dimensions')),
28+
(16, ('the', 'sense', 'of')),
29+
(13, ('by', 'the', 'sense')),
30+
(13, ('as', 'well', 'as'))]
4131

42-
assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
32+
# Test isclose
33+
assert isclose(P1['the'], 0.0611, rel_tol=0.001)
34+
assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01)
4335
assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001)
4436

37+
# Test cond_prob.get
4538
assert P2.cond_prob.get(('went',)) is None
46-
4739
assert P3.cond_prob['in', 'order'].dictionary == {'to': 6}
4840

41+
# Test dictionary
4942
test_string = 'unigram'
5043
wordseq = words(test_string)
51-
5244
P1 = UnigramWordModel(wordseq)
53-
5445
assert P1.dictionary == {('unigram'): 1}
5546

5647
test_string = 'bigram text'
5748
wordseq = words(test_string)
58-
5949
P2 = NgramWordModel(2, wordseq)
50+
assert P2.dictionary == {('bigram', 'text'): 1}
6051

61-
assert (P2.dictionary == {('', 'bigram'): 1, ('bigram', 'text'): 1} or
62-
P2.dictionary == {('bigram', 'text'): 1, ('', 'bigram'): 1})
63-
64-
65-
test_string = 'test trigram text'
52+
test_string = 'test trigram text here'
6653
wordseq = words(test_string)
67-
6854
P3 = NgramWordModel(3, wordseq)
69-
70-
assert ('', '', 'test') in P3.dictionary
71-
assert ('', 'test', 'trigram') in P3.dictionary
7255
assert ('test', 'trigram', 'text') in P3.dictionary
73-
assert len(P3.dictionary) == 3
56+
assert ('trigram', 'text', 'here') in P3.dictionary
7457

7558

7659
def test_char_models():
@@ -83,12 +66,12 @@ def test_char_models():
8366
for char in test_string.replace(' ', ''):
8467
assert char in P1.dictionary
8568

86-
test_string = 'a b c'
69+
test_string = 'alpha beta'
8770
wordseq = words(test_string)
8871
P1 = NgramCharModel(1, wordseq)
8972

90-
assert len(P1.dictionary) == len(test_string.split())
91-
for char in test_string.split():
73+
assert len(P1.dictionary) == len(set(test_string))
74+
for char in set(test_string):
9275
assert tuple(char) in P1.dictionary
9376

9477
test_string = 'bigram'
@@ -116,10 +99,9 @@ def test_char_models():
11699
test_string = 'trigram'
117100
wordseq = words(test_string)
118101
P3 = NgramCharModel(3, wordseq)
119-
120-
expected_trigrams = {(' ', ' ', 't'): 1, (' ', 't', 'r'): 1, ('t', 'r', 'i'): 1,
121-
('r', 'i', 'g'): 1, ('i', 'g', 'r'): 1, ('g', 'r', 'a'): 1,
122-
('r', 'a', 'm'): 1}
102+
expected_trigrams = {(' ', 't', 'r'): 1, ('t', 'r', 'i'): 1,
103+
('r', 'i', 'g'): 1, ('i', 'g', 'r'): 1,
104+
('g', 'r', 'a'): 1, ('r', 'a', 'm'): 1}
123105

124106
assert len(P3.dictionary) == len(expected_trigrams)
125107
for bigram, count in expected_trigrams.items():
@@ -129,17 +111,33 @@ def test_char_models():
129111
test_string = 'trigram trigram trigram'
130112
wordseq = words(test_string)
131113
P3 = NgramCharModel(3, wordseq)
132-
133-
expected_trigrams = {(' ', ' ', 't'): 3, (' ', 't', 'r'): 3, ('t', 'r', 'i'): 3,
134-
('r', 'i', 'g'): 3, ('i', 'g', 'r'): 3, ('g', 'r', 'a'): 3,
135-
('r', 'a', 'm'): 3}
114+
expected_trigrams = {(' ', 't', 'r'): 3, ('t', 'r', 'i'): 3,
115+
('r', 'i', 'g'): 3, ('i', 'g', 'r'): 3,
116+
('g', 'r', 'a'): 3, ('r', 'a', 'm'): 3}
136117

137118
assert len(P3.dictionary) == len(expected_trigrams)
138119
for bigram, count in expected_trigrams.items():
139120
assert bigram in P3.dictionary
140121
assert P3.dictionary[bigram] == count
141122

142123

124+
def test_samples():
125+
story = open_data("EN-text/flatland.txt").read()
126+
story += open_data("EN-text/gutenberg.txt").read()
127+
wordseq = words(story)
128+
P1 = UnigramWordModel(wordseq)
129+
P2 = NgramWordModel(2, wordseq)
130+
P3 = NgramWordModel(3, wordseq)
131+
132+
s1 = P1.samples(10)
133+
s2 = P3.samples(10)
134+
s3 = P3.samples(10)
135+
136+
assert len(s1.split(' ')) == 10
137+
assert len(s2.split(' ')) == 10
138+
assert len(s3.split(' ')) == 10
139+
140+
143141
def test_viterbi_segmentation():
144142
flatland = open_data("EN-text/flatland.txt").read()
145143
wordseq = words(flatland)
@@ -293,18 +291,6 @@ def test_bigrams():
293291
assert bigrams(['this', 'is', 'a', 'test']) == [['this', 'is'], ['is', 'a'], ['a', 'test']]
294292

295293

296-
# TODO: for .ipynb
297-
"""
298-
299-
>>> P1.samples(20)
300-
'you thought known but were insides of see in depend by us dodecahedrons just but i words are instead degrees'
301-
302-
>>> P2.samples(20)
303-
'flatland well then can anything else more into the total destruction and circles teach others confine women must be added'
304-
305-
>>> P3.samples(20)
306-
'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle'
307-
"""
308294

309295
if __name__ == '__main__':
310296
pytest.main()

0 commit comments

Comments
 (0)