Skip to content

Commit 31e3b65

Browse files
antmarakis authored and norvig committed
Text: Text Models (aimacode#571)
* unigram char model + rename text to word models
* Update test_text.py
* Update text.ipynb
* remove duplicate assert
1 parent b40ecc0 commit 31e3b65

File tree

3 files changed

+106
-131
lines changed

3 files changed

+106
-131
lines changed

tests/test_text.py

Lines changed: 13 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -9,9 +9,9 @@
99
def test_text_models():
1010
flatland = open_data("EN-text/flatland.txt").read()
1111
wordseq = words(flatland)
12-
P1 = UnigramTextModel(wordseq)
13-
P2 = NgramTextModel(2, wordseq)
14-
P3 = NgramTextModel(3, wordseq)
12+
P1 = UnigramWordModel(wordseq)
13+
P2 = NgramWordModel(2, wordseq)
14+
P3 = NgramWordModel(3, wordseq)
1515

1616
# The most frequent entries in each model
1717
assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'),
@@ -39,7 +39,6 @@ def test_text_models():
3939

4040
assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01)
4141

42-
assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
4342
assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001)
4443
assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001)
4544

@@ -50,14 +49,14 @@ def test_text_models():
5049
test_string = 'unigram'
5150
wordseq = words(test_string)
5251

53-
P1 = UnigramTextModel(wordseq)
52+
P1 = UnigramWordModel(wordseq)
5453

5554
assert P1.dictionary == {('unigram'): 1}
5655

5756
test_string = 'bigram text'
5857
wordseq = words(test_string)
5958

60-
P2 = NgramTextModel(2, wordseq)
59+
P2 = NgramWordModel(2, wordseq)
6160

6261
assert (P2.dictionary == {('', 'bigram'): 1, ('bigram', 'text'): 1} or
6362
P2.dictionary == {('bigram', 'text'): 1, ('', 'bigram'): 1})
@@ -66,7 +65,7 @@ def test_text_models():
6665
test_string = 'test trigram text'
6766
wordseq = words(test_string)
6867

69-
P3 = NgramTextModel(3, wordseq)
68+
P3 = NgramWordModel(3, wordseq)
7069

7170
assert ('', '', 'test') in P3.dictionary
7271
assert ('', 'test', 'trigram') in P3.dictionary
@@ -75,13 +74,14 @@ def test_text_models():
7574

7675

7776
def test_char_models():
78-
test_string = 'unigram'
77+
test_string = 'test unigram'
7978
wordseq = words(test_string)
80-
P1 = NgramCharModel(1, wordseq)
79+
P1 = UnigramCharModel(wordseq)
8180

82-
assert len(P1.dictionary) == len(test_string)
83-
for char in test_string:
84-
assert tuple(char) in P1.dictionary
81+
expected_unigrams = {'n': 1, 's': 1, 'e': 1, 'i': 1, 'm': 1, 'g': 1, 'r': 1, 'a': 1, 't': 2, 'u': 1}
82+
assert len(P1.dictionary) == len(expected_unigrams)
83+
for char in test_string.replace(' ', ''):
84+
assert char in P1.dictionary
8585

8686
test_string = 'a b c'
8787
wordseq = words(test_string)
@@ -143,7 +143,7 @@ def test_char_models():
143143
def test_viterbi_segmentation():
144144
flatland = open_data("EN-text/flatland.txt").read()
145145
wordseq = words(flatland)
146-
P = UnigramTextModel(wordseq)
146+
P = UnigramWordModel(wordseq)
147147
text = "itiseasytoreadwordswithoutspaces"
148148

149149
s, p = viterbi_segment(text, P)

0 commit comments

Comments
 (0)