66from utils import isclose , open_data
77
88
9+
def test_text_models():
    """Exercise the word-level n-gram models on Flatland and on tiny strings.

    Checks the most frequent entries returned by ``top``, a few model
    lookups compared with ``isclose``, the ``cond_prob`` tables, and the
    raw ``dictionary`` counts of unigram, bigram and trigram word models.
    """
    corpus_words = words(open_data("EN-text/flatland.txt").read())
    unigram_model = UnigramWordModel(corpus_words)
    bigram_model = NgramWordModel(2, corpus_words)
    trigram_model = NgramWordModel(3, corpus_words)

    # Test top
    expected_top_unigrams = [
        (2081, 'the'),
        (1479, 'of'),
        (1021, 'and'),
        (1008, 'to'),
        (850, 'a'),
    ]
    assert unigram_model.top(5) == expected_top_unigrams

    expected_top_bigrams = [
        (368, ('of', 'the')),
        (152, ('to', 'the')),
        (152, ('in', 'the')),
        (86, ('of', 'a')),
        (80, ('it', 'is')),
    ]
    assert bigram_model.top(5) == expected_top_bigrams

    expected_top_trigrams = [
        (30, ('a', 'straight', 'line')),
        (19, ('of', 'three', 'dimensions')),
        (16, ('the', 'sense', 'of')),
        (13, ('by', 'the', 'sense')),
        (13, ('as', 'well', 'as')),
    ]
    assert trigram_model.top(5) == expected_top_trigrams

    # Test isclose
    assert isclose(unigram_model['the'], 0.0611, rel_tol=0.001)
    assert isclose(bigram_model['of', 'the'], 0.0108, rel_tol=0.01)
    assert isclose(trigram_model['so', 'as', 'to'], 0.000323, rel_tol=0.001)

    # Test cond_prob.get
    assert bigram_model.cond_prob.get(('went',)) is None
    assert trigram_model.cond_prob['in', 'order'].dictionary == {'to': 6}

    # Test dictionary
    single_word_model = UnigramWordModel(words('unigram'))
    assert single_word_model.dictionary == {'unigram': 1}

    two_word_model = NgramWordModel(2, words('bigram text'))
    assert two_word_model.dictionary == {('bigram', 'text'): 1}

    four_word_model = NgramWordModel(3, words('test trigram text here'))
    for trigram in (('test', 'trigram', 'text'), ('trigram', 'text', 'here')):
        assert trigram in four_word_model.dictionary
7457
7558
7659def test_char_models ():
@@ -83,12 +66,12 @@ def test_char_models():
8366 for char in test_string .replace (' ' , '' ):
8467 assert char in P1 .dictionary
8568
86- test_string = 'a b c '
69+ test_string = 'alpha beta '
8770 wordseq = words (test_string )
8871 P1 = NgramCharModel (1 , wordseq )
8972
90- assert len (P1 .dictionary ) == len (test_string . split ( ))
91- for char in test_string . split ( ):
73+ assert len (P1 .dictionary ) == len (set ( test_string ))
74+ for char in set ( test_string ):
9275 assert tuple (char ) in P1 .dictionary
9376
9477 test_string = 'bigram'
@@ -116,10 +99,9 @@ def test_char_models():
11699 test_string = 'trigram'
117100 wordseq = words (test_string )
118101 P3 = NgramCharModel (3 , wordseq )
119-
120- expected_trigrams = {(' ' , ' ' , 't' ): 1 , (' ' , 't' , 'r' ): 1 , ('t' , 'r' , 'i' ): 1 ,
121- ('r' , 'i' , 'g' ): 1 , ('i' , 'g' , 'r' ): 1 , ('g' , 'r' , 'a' ): 1 ,
122- ('r' , 'a' , 'm' ): 1 }
102+ expected_trigrams = {(' ' , 't' , 'r' ): 1 , ('t' , 'r' , 'i' ): 1 ,
103+ ('r' , 'i' , 'g' ): 1 , ('i' , 'g' , 'r' ): 1 ,
104+ ('g' , 'r' , 'a' ): 1 , ('r' , 'a' , 'm' ): 1 }
123105
124106 assert len (P3 .dictionary ) == len (expected_trigrams )
125107 for bigram , count in expected_trigrams .items ():
@@ -129,17 +111,33 @@ def test_char_models():
129111 test_string = 'trigram trigram trigram'
130112 wordseq = words (test_string )
131113 P3 = NgramCharModel (3 , wordseq )
132-
133- expected_trigrams = {(' ' , ' ' , 't' ): 3 , (' ' , 't' , 'r' ): 3 , ('t' , 'r' , 'i' ): 3 ,
134- ('r' , 'i' , 'g' ): 3 , ('i' , 'g' , 'r' ): 3 , ('g' , 'r' , 'a' ): 3 ,
135- ('r' , 'a' , 'm' ): 3 }
114+ expected_trigrams = {(' ' , 't' , 'r' ): 3 , ('t' , 'r' , 'i' ): 3 ,
115+ ('r' , 'i' , 'g' ): 3 , ('i' , 'g' , 'r' ): 3 ,
116+ ('g' , 'r' , 'a' ): 3 , ('r' , 'a' , 'm' ): 3 }
136117
137118 assert len (P3 .dictionary ) == len (expected_trigrams )
138119 for bigram , count in expected_trigrams .items ():
139120 assert bigram in P3 .dictionary
140121 assert P3 .dictionary [bigram ] == count
141122
142123
def test_samples():
    """Check that ``samples`` returns the requested number of words.

    Unigram, bigram and trigram word models are built from the
    concatenated Flatland and Gutenberg texts. A 10-word sample from each
    model must split into exactly 10 space-separated tokens.
    """
    story = open_data("EN-text/flatland.txt").read()
    story += open_data("EN-text/gutenberg.txt").read()
    wordseq = words(story)
    P1 = UnigramWordModel(wordseq)
    P2 = NgramWordModel(2, wordseq)
    P3 = NgramWordModel(3, wordseq)

    s1 = P1.samples(10)
    # Sample the bigram model here: previously P3 was sampled twice and
    # P2 was constructed but never exercised.
    s2 = P2.samples(10)
    s3 = P3.samples(10)

    assert len(s1.split(' ')) == 10
    assert len(s2.split(' ')) == 10
    assert len(s3.split(' ')) == 10
140+
143141def test_viterbi_segmentation ():
144142 flatland = open_data ("EN-text/flatland.txt" ).read ()
145143 wordseq = words (flatland )
@@ -293,18 +291,6 @@ def test_bigrams():
293291 assert bigrams (['this' , 'is' , 'a' , 'test' ]) == [['this' , 'is' ], ['is' , 'a' ], ['a' , 'test' ]]
294292
295293
296- # TODO: for .ipynb
297- """
298-
299- >>> P1.samples(20)
300- 'you thought known but were insides of see in depend by us dodecahedrons just but i words are instead degrees'
301-
302- >>> P2.samples(20)
303- 'flatland well then can anything else more into the total destruction and circles teach others confine women must be added'
304-
305- >>> P3.samples(20)
306- 'flatland by edwin a abbott 1884 to the wake of a certificate from nature herself proving the equal sided triangle'
307- """
308294
# Allow running this test module directly as a script.
if __name__ == "__main__":
    pytest.main()
0 commit comments