@@ -74,6 +74,72 @@ def test_text_models():
7474 assert len (P3 .dictionary ) == 3
7575
7676
def test_char_models():
    """Check the n-gram tables NgramCharModel builds for n = 1, 2 and 3.

    Every case builds a model from ``words(text)`` and compares the model's
    ``dictionary`` with a hand-written expectation.  The expected bigram and
    trigram tables include entries that start with ' ' (for example
    ``(' ', 'b')``), i.e. the test expects the first characters of a word to
    be preceded by space characters.
    NOTE(review): presumably NgramCharModel pads each word with n - 1 leading
    spaces -- confirm against its implementation.
    """

    def assert_counts(model, expected):
        # Compare sizes first so unexpected extra n-grams are caught as well
        # as missing ones, then check every expected n-gram and its count.
        assert len(model.dictionary) == len(expected)
        for ngram, count in expected.items():
            assert ngram in model.dictionary
            assert model.dictionary[ngram] == count

    # Unigrams over a single word: one entry per distinct character.
    # Comparing against the *distinct* characters keeps the check valid even
    # if the sample word is later changed to one with repeated letters.
    test_string = 'unigram'
    P1 = NgramCharModel(1, words(test_string))
    distinct_chars = sorted(set(test_string))

    assert len(P1.dictionary) == len(distinct_chars)
    for char in distinct_chars:
        assert (char,) in P1.dictionary

    # Unigrams over several one-letter words: the separating spaces must not
    # show up as entries, so only the distinct tokens are counted.
    test_string = 'a b c'
    P1 = NgramCharModel(1, words(test_string))
    distinct_tokens = sorted(set(test_string.split()))

    assert len(P1.dictionary) == len(distinct_tokens)
    for token in distinct_tokens:
        assert tuple(token) in P1.dictionary

    # Bigrams: repeating the same word N times must multiply every count by N
    # without introducing any new bigram.
    bigrams = [(' ', 'b'), ('b', 'i'), ('i', 'g'),
               ('g', 'r'), ('r', 'a'), ('a', 'm')]
    for repeats in (1, 2):
        test_string = ' '.join(['bigram'] * repeats)
        P2 = NgramCharModel(2, words(test_string))
        assert_counts(P2, {bigram: repeats for bigram in bigrams})

    # Trigrams: same idea, with two leading entries that contain spaces.
    trigrams = [(' ', ' ', 't'), (' ', 't', 'r'), ('t', 'r', 'i'),
                ('r', 'i', 'g'), ('i', 'g', 'r'), ('g', 'r', 'a'),
                ('r', 'a', 'm')]
    for repeats in (1, 3):
        test_string = ' '.join(['trigram'] * repeats)
        P3 = NgramCharModel(3, words(test_string))
        assert_counts(P3, {trigram: repeats for trigram in trigrams})
141+
142+
77143def test_viterbi_segmentation ():
78144 flatland = DataFile ("EN-text/flatland.txt" ).read ()
79145 wordseq = words (flatland )
0 commit comments