11import pytest
22import nlp
3+
34from nlp import loadPageHTML , stripRawHTML , findOutlinks , onlyWikipediaURLS
45from nlp import expand_pages , relevant_pages , normalize , ConvergenceDetector , getInlinks
56from nlp import getOutlinks , Page
67from nlp import Rules , Lexicon
# Clumsy imports because we want to access certain nlp.py globals explicitly:
# they are accessed by functions within nlp.py
910
11+ from unittest .mock import patch
12+ from io import BytesIO
13+
1014
def test_rules():
    """A 'B C | D E' production should parse into two alternative bodies."""
    produced = Rules(A="B C | D E")
    expected = {'A': [['B', 'C'], ['D', 'E']]}
    assert produced == expected
@@ -27,6 +31,19 @@ def test_lexicon():
2731 < href="/wiki/TestThing" > href="/wiki/TestBoy"
2832 href="/wiki/TestLiving" href="/wiki/TestMan" >"""
# Degenerate fixture: plain text with no HTML markup at all.
testHTML2 = "Nothing"
# Well-formed HTML document fixture; test_stripRawHTML serves it through a
# mocked urllib.request.urlopen so the test runs without network access.
testHTML3 = """
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>

<p>AIMA book</p>

</body>
</html>
"""
3047
# Shared Page fixtures used by the graph-based tests below.
# NOTE(review): positional meaning of the numeric and list arguments is
# inferred from nlp.Page's signature — confirm against nlp.py.
pA = Page("A", 1, 6, ["B", "C", "E"], ["D"])
pB = Page("B", 2, 5, ["E"], ["A", "C", "D"])
@@ -52,12 +69,14 @@ def test_lexicon():
5269# assert all(loadedPages.get(key,"") != "" for key in addresses)
5370
5471
@patch('urllib.request.urlopen',
       side_effect=lambda *args, **kwargs: BytesIO(testHTML3.encode()))
def test_stripRawHTML(html_mock):
    """loadPageHTML fetches raw HTML and stripRawHTML removes the markup.

    urllib.request.urlopen is patched so no real network access happens.
    A fresh BytesIO is built per call via side_effect: a single shared
    return_value stream would be exhausted after the first read, so any
    later urlopen call would silently see empty bytes.
    """
    addr = "https://en.wikipedia.org/wiki/Ethics"
    aPage = loadPageHTML([addr])
    someHTML = aPage[addr]
    strippedHTML = stripRawHTML(someHTML)
    # The fetch must have gone through the mock, not the network.
    assert html_mock.called
    # The <head> element must be stripped, but visible text must survive.
    assert "<head>" not in strippedHTML and "</head>" not in strippedHTML
    assert "AIMA book" in someHTML and "AIMA book" in strippedHTML
6180
6281
6382def test_determineInlinks ():
0 commit comments