Skip to content

Commit f051d97

Browse files
Refactored code for increased optimization. (#1139)
* Refactoring by the Decompose Conditional technique * Refactoring by the Introduce Explaining Variable technique * Refactoring by the Rename Method/Variable technique * Refactoring by the Introduce Explaining Variable technique * Added Extract Class refactoring to increase maintainability * Refactoring using Replace Conditional with Polymorphism
1 parent 9b9f173 commit f051d97

File tree

7 files changed

+150
-74
lines changed

7 files changed

+150
-74
lines changed

webmagic-core/src/main/java/us/codecraft/webmagic/Page.java

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -169,18 +169,25 @@ public void addTargetRequests(Iterable<String> requests, long priority) {
169169
* @param priority Priority for the URL
170170
*/
171171
private void addRequestIfValid(String url, long priority) {
172-
if (StringUtils.isBlank(url) || url.equals("#") || url.startsWith("javascript:")) {
173-
return;
172+
boolean isBlankUrl = StringUtils.isBlank(url);
173+
boolean isHashSymbol = url.equals("#");
174+
boolean isJavaScript = url.startsWith("javascript:");
175+
176+
if (isBlankUrl || isHashSymbol || isJavaScript) {
177+
return; // Invalid URL, so no further processing is needed.
174178
}
175179

176180
String canonicalizedUrl = UrlUtils.canonicalizeUrl(url, this.url.toString());
177-
Request req = new Request(canonicalizedUrl);
178-
if(priority > 0) {
179-
req.setPriority(priority);
181+
Request request = new Request(canonicalizedUrl);
182+
183+
if (priority > 0) {
184+
request.setPriority(priority);
180185
}
181-
targetRequests.add(req);
186+
187+
targetRequests.add(request);
182188
}
183189

190+
184191
/**
185192
* add url to fetch
186193
*

webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,13 +40,14 @@ public class HttpClientGenerator {
4040

4141
private PoolingHttpClientConnectionManager connectionManager;
4242

43+
private static final int DEFAULT_MAX_PER_ROUTE = 100;
4344
public HttpClientGenerator() {
4445
Registry<ConnectionSocketFactory> reg = RegistryBuilder.<ConnectionSocketFactory>create()
4546
.register("http", PlainConnectionSocketFactory.INSTANCE)
4647
.register("https", buildSSLConnectionSocketFactory())
4748
.build();
4849
connectionManager = new PoolingHttpClientConnectionManager(reg);
49-
connectionManager.setDefaultMaxPerRoute(100);
50+
connectionManager.setDefaultMaxPerRoute(DEFAULT_MAX_PER_ROUTE);
5051
}
5152

5253
private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() {

webmagic-core/src/main/java/us/codecraft/webmagic/model/HttpRequestBody.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ public void setEncoding(String encoding) {
6464
this.encoding = encoding;
6565
}
6666

67-
public static HttpRequestBody json(String json, String encoding) {
67+
public static HttpRequestBody createJsonRequestBody(String json, String encoding) {
6868
try {
6969
return new HttpRequestBody(json.getBytes(encoding), ContentType.JSON, encoding);
7070
} catch (UnsupportedEncodingException e) {
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
package us.codecraft.webmagic.selector;
2+
3+
import org.jsoup.nodes.Document;
4+
import org.jsoup.nodes.Element;
5+
6+
import java.util.ArrayList;
7+
import java.util.List;
8+
import java.util.ListIterator;
9+
10+
public class ElementsUtil {
11+
HtmlNode htmlNode = new HtmlNode();
12+
public Selectable selectElements(BaseElementSelector elementSelector) {
13+
ListIterator<Element> elementIterator = htmlNode.getElements().listIterator();
14+
if (!elementSelector.hasAttribute()) {
15+
List<Element> resultElements = new ArrayList<Element>();
16+
while (elementIterator.hasNext()) {
17+
Element element = checkElementAndConvert(elementIterator);
18+
List<Element> selectElements = elementSelector.selectElements(element);
19+
resultElements.addAll(selectElements);
20+
}
21+
return new HtmlNode(resultElements);
22+
} else {
23+
// has attribute, consider as plaintext
24+
List<String> resultStrings = new ArrayList<String>();
25+
while (elementIterator.hasNext()) {
26+
Element element = checkElementAndConvert(elementIterator);
27+
List<String> selectList = elementSelector.selectList(element);
28+
resultStrings.addAll(selectList);
29+
}
30+
return new PlainText(resultStrings);
31+
32+
}
33+
}
34+
35+
/**
36+
* Only document can be select
37+
* See: https://github.com/code4craft/webmagic/issues/113
38+
*
39+
* @param elementIterator elementIterator
40+
* @return element element
41+
*/
42+
public Element checkElementAndConvert(ListIterator<Element> elementIterator) {
43+
Element element = elementIterator.next();
44+
if (!(element instanceof Document)) {
45+
Document root = new Document(element.ownerDocument().baseUri());
46+
Element clone = element.clone();
47+
root.appendChild(clone);
48+
elementIterator.set(root);
49+
return root;
50+
}
51+
return element;
52+
}
53+
}

webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java

Lines changed: 10 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -33,19 +33,22 @@ public Selectable smartContent() {
3333

3434
@Override
3535
public Selectable links() {
36-
return selectElements(new LinksSelector());
36+
ElementsUtil elementsUtil = new ElementsUtil();
37+
return elementsUtil.selectElements(new LinksSelector());
3738
}
3839

3940
@Override
4041
public Selectable xpath(String xpath) {
42+
ElementsUtil elementsUtil = new ElementsUtil();
4143
XpathSelector xpathSelector = Selectors.xpath(xpath);
42-
return selectElements(xpathSelector);
44+
return elementsUtil.selectElements(xpathSelector);
4345
}
4446

4547
@Override
4648
public Selectable selectList(Selector selector) {
4749
if (selector instanceof BaseElementSelector) {
48-
return selectElements((BaseElementSelector) selector);
50+
ElementsUtil elementsUtil = new ElementsUtil();
51+
return elementsUtil.selectElements((BaseElementSelector) selector);
4952
}
5053
return selectList(selector, getSourceTexts());
5154
}
@@ -55,64 +58,18 @@ public Selectable select(Selector selector) {
5558
return selectList(selector);
5659
}
5760

58-
/**
59-
* select elements
60-
*
61-
* @param elementSelector elementSelector
62-
* @return result
63-
*/
64-
protected Selectable selectElements(BaseElementSelector elementSelector) {
65-
ListIterator<Element> elementIterator = getElements().listIterator();
66-
if (!elementSelector.hasAttribute()) {
67-
List<Element> resultElements = new ArrayList<Element>();
68-
while (elementIterator.hasNext()) {
69-
Element element = checkElementAndConvert(elementIterator);
70-
List<Element> selectElements = elementSelector.selectElements(element);
71-
resultElements.addAll(selectElements);
72-
}
73-
return new HtmlNode(resultElements);
74-
} else {
75-
// has attribute, consider as plaintext
76-
List<String> resultStrings = new ArrayList<String>();
77-
while (elementIterator.hasNext()) {
78-
Element element = checkElementAndConvert(elementIterator);
79-
List<String> selectList = elementSelector.selectList(element);
80-
resultStrings.addAll(selectList);
81-
}
82-
return new PlainText(resultStrings);
83-
84-
}
85-
}
86-
87-
/**
88-
* Only document can be select
89-
* See: https://github.com/code4craft/webmagic/issues/113
90-
*
91-
* @param elementIterator elementIterator
92-
* @return element element
93-
*/
94-
private Element checkElementAndConvert(ListIterator<Element> elementIterator) {
95-
Element element = elementIterator.next();
96-
if (!(element instanceof Document)) {
97-
Document root = new Document(element.ownerDocument().baseUri());
98-
Element clone = element.clone();
99-
root.appendChild(clone);
100-
elementIterator.set(root);
101-
return root;
102-
}
103-
return element;
104-
}
105-
10661
@Override
10762
public Selectable $(String selector) {
63+
ElementsUtil elementsUtil = new ElementsUtil();
10864
CssSelector cssSelector = Selectors.$(selector);
109-
return selectElements(cssSelector);
65+
return elementsUtil.selectElements(cssSelector);
11066
}
11167

11268
@Override
11369
public Selectable $(String selector, String attrName) {
70+
ElementsUtil elementsUtil = new ElementsUtil();
11471
CssSelector cssSelector = Selectors.$(selector, attrName);
115-
return selectElements(cssSelector);
72+
return elementsUtil.selectElements(cssSelector);
11673
}
11774

11875
@Override

webmagic-extension/src/main/java/us/codecraft/webmagic/configurable/ExtractRule.java

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -76,26 +76,27 @@ public Selector getSelector() {
7676
}
7777

7878
private Selector compileSelector() {
79+
SelectorFactory factory;
7980
switch (expressionType) {
8081
case Css:
81-
if (expressionParams.length >= 1) {
82-
return $(expressionValue, expressionParams[0]);
83-
} else {
84-
return $(expressionValue);
85-
}
82+
factory = new CssSelectorFactory();
83+
break;
8684
case XPath:
87-
return xpath(expressionValue);
85+
factory = new XPathSelectorFactory();
86+
break;
8887
case Regex:
89-
if (expressionParams.length >= 1) {
90-
return regex(expressionValue, Integer.parseInt(expressionParams[0]));
91-
} else {
92-
return regex(expressionValue);
93-
}
88+
factory = new RegexSelectorFactory();
89+
break;
9490
case JsonPath:
95-
return new JsonPathSelector(expressionValue);
91+
factory = new JsonPathSelectorFactory();
92+
break;
9693
default:
97-
return xpath(expressionValue);
94+
factory = new XPathSelectorFactory(); // Default to XPath
9895
}
96+
97+
SelectorCompiler selectorCompiler = new SelectorCompiler(factory);
98+
Selector compiledSelector = selectorCompiler.compileSelector(expressionValue, expressionParams);
99+
return compiledSelector;
99100
}
100101

101102
public void setSelector(Selector selector) {
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
package us.codecraft.webmagic.configurable;
2+
3+
import us.codecraft.webmagic.selector.JsonPathSelector;
4+
import us.codecraft.webmagic.selector.Selector;
5+
6+
import static us.codecraft.webmagic.selector.Selectors.*;
7+
public interface SelectorFactory {
8+
Selector compileSelector(String expressionValue, String[] expressionParams);
9+
}
10+
11+
class CssSelectorFactory implements SelectorFactory {
12+
@Override
13+
public Selector compileSelector(String expressionValue, String[] expressionParams) {
14+
if (expressionParams.length >= 1) {
15+
return $(expressionValue, expressionParams[0]);
16+
} else {
17+
return $(expressionValue);
18+
}
19+
}
20+
}
21+
22+
class XPathSelectorFactory implements SelectorFactory {
23+
@Override
24+
public Selector compileSelector(String expressionValue, String[] expressionParams) {
25+
return xpath(expressionValue);
26+
}
27+
}
28+
29+
class RegexSelectorFactory implements SelectorFactory {
30+
@Override
31+
public Selector compileSelector(String expressionValue, String[] expressionParams) {
32+
if (expressionParams.length >= 1) {
33+
return regex(expressionValue, Integer.parseInt(expressionParams[0]));
34+
} else {
35+
return regex(expressionValue);
36+
}
37+
}
38+
}
39+
40+
class JsonPathSelectorFactory implements SelectorFactory {
41+
@Override
42+
public Selector compileSelector(String expressionValue, String[] expressionParams) {
43+
return new JsonPathSelector(expressionValue);
44+
}
45+
}
46+
47+
class SelectorCompiler {
48+
private final SelectorFactory selectorFactory;
49+
50+
public SelectorCompiler(SelectorFactory selectorFactory) {
51+
this.selectorFactory = selectorFactory;
52+
}
53+
54+
public Selector compileSelector(String expressionValue, String[] expressionParams) {
55+
return selectorFactory.compileSelector(expressionValue, expressionParams);
56+
}
57+
}

0 commit comments

Comments
 (0)