11import  pytest 
22import  nlp 
3+ 
34from  nlp  import  loadPageHTML , stripRawHTML , findOutlinks , onlyWikipediaURLS 
45from  nlp  import  expand_pages , relevant_pages , normalize , ConvergenceDetector , getInlinks 
56from  nlp  import  getOutlinks , Page 
67from  nlp  import  Rules , Lexicon 
# Clumsy imports because we want to access certain nlp.py globals explicitly, because
# they are accessed by functions within nlp.py
910
11+ from  unittest .mock  import  patch 
12+ from  io  import  BytesIO 
13+ 
1014
def test_rules():
    # A rule string uses "|" to separate alternatives; each alternative is
    # split on whitespace into a list of symbols.
    assert Rules(A="B C | D E") == {'A': [['B', 'C'], ['D', 'E']]}
@@ -27,6 +31,19 @@ def test_lexicon():
2731            < href="/wiki/TestThing" > href="/wiki/TestBoy" 
2832            href="/wiki/TestLiving" href="/wiki/TestMan" >""" 
testHTML2 = "Nothing"
# Minimal well-formed HTML document served to the mocked urllib.request.urlopen
# in test_stripRawHTML: the <head> section should be stripped while the visible
# body text "AIMA book" must survive stripping.
testHTML3 = """
            <!DOCTYPE html>
            <html>
            <head>
            <title>Page Title</title>
            </head>
            <body>

            <p>AIMA book</p>

            </body>
            </html>
            """
3047
# Small hand-built page-graph fixtures for the link-analysis tests.
# NOTE(review): positional args look like (address, score, score, links, links);
# confirm the exact parameter order against nlp.Page's constructor.
pA = Page("A", 1, 6, ["B", "C", "E"], ["D"])
pB = Page("B", 2, 5, ["E"], ["A", "C", "D"])
@@ -52,12 +69,14 @@ def test_lexicon():
5269#     assert all(loadedPages.get(key,"") != "" for key in addresses) 
5370
5471
@patch('urllib.request.urlopen', return_value=BytesIO(testHTML3.encode()))
def test_stripRawHTML(html_mock):
    """stripRawHTML drops markup like <head> but keeps the visible body text.

    The network fetch is mocked so loadPageHTML reads testHTML3 instead of
    hitting Wikipedia.
    """
    address = "https://en.wikipedia.org/wiki/Ethics"
    fetched = loadPageHTML([address])
    raw_html = fetched[address]
    stripped = stripRawHTML(raw_html)
    # Head markup must be removed by stripping.
    assert "<head>" not in stripped
    assert "</head>" not in stripped
    # Visible content is present before stripping and preserved after it.
    assert "AIMA book" in raw_html
    assert "AIMA book" in stripped
6180
6281
6382def  test_determineInlinks ():
0 commit comments