diff --git a/README-zh.md b/README-zh.md index 62b3c9a5e..c3c4b72ea 100644 --- a/README-zh.md +++ b/README-zh.md @@ -39,12 +39,12 @@ webmagic使用maven管理依赖,在项目中添加对应的依赖即可使用w us.codecraft webmagic-core - 0.7.5 + ${webmagic.version} us.codecraft webmagic-extension - 0.7.5 + ${webmagic.version} ``` diff --git a/README.md b/README.md index 14aeac7b1..750a76841 100644 --- a/README.md +++ b/README.md @@ -25,12 +25,12 @@ Add dependencies to your pom.xml: us.codecraft webmagic-core - 0.7.5 + ${webmagic.version} us.codecraft webmagic-extension - 0.7.5 + ${webmagic.version} ``` diff --git a/pom.xml b/pom.xml index 68bf76d9c..9e4c45077 100644 --- a/pom.xml +++ b/pom.xml @@ -1,7 +1,7 @@ us.codecraft - 0.8.0 + 0.9.0 4.0.0 pom @@ -16,13 +16,13 @@ 3.12.0 2.0.19.graal 3.0.13 - 31.1-jre + 32.0.0-jre 2.26 4.5.13 4.4.15 3.7.1 9.3.9.0 - 2.7.0 + 2.8.0 4.13.2 2.7.3 1.2.17 @@ -124,7 +124,7 @@ us.codecraft xsoup - 0.3.6 + 0.3.7 com.alibaba diff --git a/webmagic-core/pom.xml b/webmagic-core/pom.xml index 997eb812c..b4feb1671 100644 --- a/webmagic-core/pom.xml +++ b/webmagic-core/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.9.0 4.0.0 diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java index c11df693c..6370171df 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Page.java @@ -20,7 +20,7 @@ * {@link #getHtml()} get content of current page
* {@link #putField(String, Object)} save extracted result
* {@link #getResultItems()} get extract results to be used in {@link us.codecraft.webmagic.pipeline.Pipeline}
- * {@link #addTargetRequests(java.util.List)} {@link #addTargetRequest(String)} add urls to fetch
+ * {@link #addTargetRequests(Iterable)} {@link #addTargetRequest(String)} add urls to fetch
* * @author code4crafter@gmail.com
* @see us.codecraft.webmagic.downloader.Downloader @@ -52,7 +52,7 @@ public class Page { private List targetRequests = new ArrayList(); private String charset; - + public Page() { } @@ -108,7 +108,8 @@ public Json getJson() { * @deprecated since 0.4.0 * The html is parse just when first time of calling {@link #getHtml()}, so use {@link #setRawText(String)} instead. */ - public void setHtml(Html html) { + @Deprecated + public void setHtml(Html html) { this.html = html; } @@ -121,7 +122,7 @@ public List getTargetRequests() { * * @param requests requests */ - public void addTargetRequests(List requests) { + public void addTargetRequests(Iterable requests) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; @@ -137,7 +138,7 @@ public void addTargetRequests(List requests) { * @param requests requests * @param priority priority */ - public void addTargetRequests(List requests, long priority) { + public void addTargetRequests(Iterable requests, long priority) { for (String s : requests) { if (StringUtils.isBlank(s) || s.equals("#") || s.startsWith("javascript:")) { continue; diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java index 4879b2825..230337756 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/Site.java @@ -28,6 +28,8 @@ public class Site { private String charset; + private String defaultCharset; + private int sleepTime = 5000; private int retryTimes = 0; @@ -168,6 +170,30 @@ public String getCharset() { return charset; } + /** + * Set default charset of page. + * + * When charset detect failed, use this default charset. 
+ * + * @param defaultCharset the default charset + * @return this + * @since 0.9.0 + */ + public Site setDefaultCharset(String defaultCharset) { + this.defaultCharset = defaultCharset; + return this; + } + + /** + * The default charset used if charset detection fails. + * + * @return the default charset + * @since 0.9.0 + */ + public String getDefaultCharset() { + return defaultCharset; + } + public int getTimeOut() { return timeOut; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java index 72821f3c1..2f3ef58ed 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientDownloader.java @@ -4,6 +4,7 @@ import java.nio.charset.Charset; import java.util.HashMap; import java.util.Map; +import java.util.Optional; import org.apache.commons.io.IOUtils; import org.apache.http.HttpResponse; @@ -76,7 +77,7 @@ public Page download(Request request, Task task) { } CloseableHttpResponse httpResponse = null; CloseableHttpClient httpClient = getHttpClient(task.getSite()); - Proxy proxy = proxyProvider != null ? proxyProvider.getProxy(task) : null; + Proxy proxy = proxyProvider != null ? 
proxyProvider.getProxy(request, task) : null; HttpClientRequestContext requestContext = httpUriRequestConverter.convert(request, task.getSite(), proxy); Page page = Page.fail(); try { @@ -116,7 +117,7 @@ protected Page handleResponse(Request request, String charset, HttpResponse http page.setBytes(bytes); if (!request.isBinaryContent()) { if (charset == null) { - charset = getHtmlCharset(contentType, bytes); + charset = getHtmlCharset(contentType, bytes, task); } page.setCharset(charset); page.setRawText(new String(bytes, charset)); @@ -131,11 +132,11 @@ protected Page handleResponse(Request request, String charset, HttpResponse http return page; } - private String getHtmlCharset(String contentType, byte[] contentBytes) throws IOException { + private String getHtmlCharset(String contentType, byte[] contentBytes, Task task) throws IOException { String charset = CharsetUtils.detectCharset(contentType, contentBytes); if (charset == null) { - charset = Charset.defaultCharset().name(); - logger.warn("Charset autodetect failed, use {} as charset. 
Please specify charset in Site.setCharset()", Charset.defaultCharset()); + charset = Optional.ofNullable(task.getSite().getDefaultCharset()).orElseGet(Charset.defaultCharset()::name); + logger.info("Charset autodetect failed, use {} as charset.", task.getSite().getDefaultCharset()); } return charset; } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java index 80e0f1085..167a5e1c6 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/downloader/HttpClientGenerator.java @@ -1,16 +1,5 @@ package us.codecraft.webmagic.downloader; -import java.io.IOException; -import java.security.KeyManagementException; -import java.security.NoSuchAlgorithmException; -import java.security.cert.CertificateException; -import java.security.cert.X509Certificate; -import java.util.Map; - -import javax.net.ssl.SSLContext; -import javax.net.ssl.TrustManager; -import javax.net.ssl.X509TrustManager; - import org.apache.commons.lang3.JavaVersion; import org.apache.commons.lang3.SystemUtils; import org.apache.http.HttpException; @@ -22,28 +11,32 @@ import org.apache.http.config.SocketConfig; import org.apache.http.conn.socket.ConnectionSocketFactory; import org.apache.http.conn.socket.PlainConnectionSocketFactory; -import org.apache.http.conn.ssl.DefaultHostnameVerifier; import org.apache.http.conn.ssl.SSLConnectionSocketFactory; -import org.apache.http.impl.client.BasicCookieStore; -import org.apache.http.impl.client.CloseableHttpClient; -import org.apache.http.impl.client.DefaultHttpRequestRetryHandler; -import org.apache.http.impl.client.HttpClientBuilder; -import org.apache.http.impl.client.HttpClients; +import org.apache.http.impl.client.*; import org.apache.http.impl.conn.PoolingHttpClientConnectionManager; import 
org.apache.http.impl.cookie.BasicClientCookie; import org.apache.http.protocol.HttpContext; import org.slf4j.Logger; import org.slf4j.LoggerFactory; - import us.codecraft.webmagic.Site; +import javax.net.ssl.SSLContext; +import javax.net.ssl.TrustManager; +import javax.net.ssl.X509TrustManager; +import java.io.IOException; +import java.security.KeyManagementException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.CertificateException; +import java.security.cert.X509Certificate; +import java.util.Map; + /** * @author code4crafter@gmail.com
* @since 0.4.0 */ public class HttpClientGenerator { - private transient Logger logger = LoggerFactory.getLogger(getClass()); + private transient Logger logger = LoggerFactory.getLogger(getClass()); private PoolingHttpClientConnectionManager connectionManager; @@ -61,21 +54,20 @@ private SSLConnectionSocketFactory buildSSLConnectionSocketFactory() { SSLContext sslContext = createIgnoreVerifySSL(); String[] supportedProtocols; if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_11)) { - supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3" }; + supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2", "TLSv1.3"}; } else { - supportedProtocols = new String[] { "SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2" }; + supportedProtocols = new String[]{"SSLv3", "TLSv1", "TLSv1.1", "TLSv1.2"}; } logger.debug("supportedProtocols: {}", String.join(", ", supportedProtocols)); return new SSLConnectionSocketFactory(sslContext, supportedProtocols, null, - new DefaultHostnameVerifier()); // 优先绕过安全证书 - } catch (KeyManagementException e) { - logger.error("ssl connection fail", e); - } catch (NoSuchAlgorithmException e) { + //不进行主机校验 + (host, sslSession) -> true); // 优先绕过安全证书 + } catch (KeyManagementException | NoSuchAlgorithmException e) { logger.error("ssl connection fail", e); } return SSLConnectionSocketFactory.getSocketFactory(); - } + } private SSLContext createIgnoreVerifySSL() throws NoSuchAlgorithmException, KeyManagementException { // 实现一个X509TrustManager接口,用于绕过验证,不用修改里面的方法 @@ -97,9 +89,9 @@ public X509Certificate[] getAcceptedIssuers() { }; SSLContext sc = SSLContext.getInstance("TLS"); - sc.init(null, new TrustManager[] { trustManager }, null); + sc.init(null, new TrustManager[]{trustManager}, null); return sc; - } + } public HttpClientGenerator setPoolSize(int poolSize) { connectionManager.setMaxTotal(poolSize); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java 
b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java index 0cef4ed42..8eab4d6de 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/ProxyProvider.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.proxy; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; /** @@ -23,7 +24,23 @@ public interface ProxyProvider { * Get a proxy for task by some strategy. * @param task the download task * @return proxy + * @deprecated Use {@link #getProxy(Request, Task)} instead. */ - Proxy getProxy(Task task); + @Deprecated + default Proxy getProxy(Task task) { + throw new UnsupportedOperationException(); + } + + /** + * Returns a proxy for the request. + * + * @param request the request + * @param task the download task + * @return proxy + * @since 0.9.0 + */ + default Proxy getProxy(Request request, Task task) { + return this.getProxy(task); + } } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java index ddef6a88c..f4c3f73bb 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/proxy/SimpleProxyProvider.java @@ -1,6 +1,7 @@ package us.codecraft.webmagic.proxy; import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Task; import java.util.ArrayList; @@ -44,7 +45,7 @@ public void returnProxy(Proxy proxy, Page page, Task task) { } @Override - public Proxy getProxy(Task task) { + public Proxy getProxy(Request request, Task task) { return proxies.get(incrForLoop()); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java 
index b267d5ba9..6001767d8 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/BaseElementSelector.java @@ -3,6 +3,7 @@ import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; +import us.codecraft.webmagic.utils.BaseSelectorUtils; import java.util.ArrayList; import java.util.List; @@ -13,16 +14,9 @@ */ public abstract class BaseElementSelector implements Selector, ElementSelector { private Document parse(String text) { - if (text == null) { - return null; - } - // Jsoup could not parse or tag directly // https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag - if ((text.startsWith("") && text.endsWith("")) - || (text.startsWith("") && text.endsWith(""))) { - text = "" + text + "
"; - } + text = BaseSelectorUtils.preParse(text); return Jsoup.parse(text); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java new file mode 100644 index 000000000..04c0651c3 --- /dev/null +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/utils/BaseSelectorUtils.java @@ -0,0 +1,23 @@ +package us.codecraft.webmagic.utils; + +/** + * @author hooy + */ +public class BaseSelectorUtils { + + /** + * Jsoup/HtmlCleaner could not parse "tr" or "td" tag directly + * https://stackoverflow.com/questions/63607740/jsoup-couldnt-parse-tr-tag + * + * @param text - the html string + * @return text + */ + public static String preParse(String text) { + if (((text.startsWith("") || text.startsWith("")) + || ((text.startsWith("") || text.startsWith(""))) { + text = "" + text + "
"; + } + return text; + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java new file mode 100644 index 000000000..783b82ddc --- /dev/null +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/SiteTest.java @@ -0,0 +1,17 @@ +package us.codecraft.webmagic; + +import static org.junit.Assert.assertEquals; + +import java.nio.charset.StandardCharsets; + +import org.junit.Test; + +public class SiteTest { + + @Test + public void test() { + Site site = Site.me().setDefaultCharset(StandardCharsets.UTF_8.name()); + assertEquals(StandardCharsets.UTF_8.name(), site.getDefaultCharset()); + } + +} diff --git a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java index 6495b16bf..e9325a7a7 100644 --- a/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java +++ b/webmagic-core/src/test/java/us/codecraft/webmagic/proxy/SimpleProxyProviderTest.java @@ -1,6 +1,9 @@ package us.codecraft.webmagic.proxy; import org.junit.Test; +import org.mockito.Mockito; + +import us.codecraft.webmagic.Request; import us.codecraft.webmagic.Site; import us.codecraft.webmagic.Task; @@ -20,11 +23,12 @@ public void test_get_proxy() throws Exception { Proxy originProxy1 = new Proxy("127.0.0.1", 1087); Proxy originProxy2 = new Proxy("127.0.0.1", 1088); SimpleProxyProvider proxyProvider = SimpleProxyProvider.from(originProxy1, originProxy2); - Proxy proxy = proxyProvider.getProxy(TASK); + Request request = Mockito.mock(Request.class); + Proxy proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy1); - proxy = proxyProvider.getProxy(TASK); + proxy = proxyProvider.getProxy(request, TASK); assertThat(proxy).isEqualTo(originProxy2); - proxy = proxyProvider.getProxy(TASK); + proxy = proxyProvider.getProxy(request, TASK); 
assertThat(proxy).isEqualTo(originProxy1); } } diff --git a/webmagic-coverage/pom.xml b/webmagic-coverage/pom.xml index e2c0f741c..a0a5ffb48 100644 --- a/webmagic-coverage/pom.xml +++ b/webmagic-coverage/pom.xml @@ -8,7 +8,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.9.0 webmagic-coverage diff --git a/webmagic-extension/pom.xml b/webmagic-extension/pom.xml index 05d6100a6..7cf0aa617 100644 --- a/webmagic-extension/pom.xml +++ b/webmagic-extension/pom.xml @@ -3,7 +3,7 @@ us.codecraft webmagic-parent - 0.8.0 + 0.9.0 4.0.0 @@ -14,6 +14,11 @@ redis.clients jedis
+ + org.assertj + assertj-core + test + com.google.guava guava diff --git a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java index b213dda94..50dbcaf1a 100644 --- a/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java +++ b/webmagic-extension/src/main/java/us/codecraft/webmagic/monitor/SpiderMonitor.java @@ -1,21 +1,25 @@ package us.codecraft.webmagic.monitor; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import us.codecraft.webmagic.Request; -import us.codecraft.webmagic.Spider; -import us.codecraft.webmagic.SpiderListener; -import us.codecraft.webmagic.utils.Experimental; -import us.codecraft.webmagic.utils.UrlUtils; - -import javax.management.*; import java.lang.management.ManagementFactory; import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; +import javax.management.InstanceAlreadyExistsException; +import javax.management.JMException; +import javax.management.MBeanRegistrationException; +import javax.management.MBeanServer; +import javax.management.MalformedObjectNameException; +import javax.management.NotCompliantMBeanException; +import javax.management.ObjectName; + +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.SpiderListener; +import us.codecraft.webmagic.utils.Experimental; +import us.codecraft.webmagic.utils.UrlUtils; + /** * @author code4crafer@gmail.com * @since 0.5.0 @@ -23,17 +27,13 @@ @Experimental public class SpiderMonitor { - private static SpiderMonitor INSTANCE = new SpiderMonitor(); - - private AtomicBoolean started = new AtomicBoolean(false); - - private Logger logger = LoggerFactory.getLogger(getClass()); + private static final SpiderMonitor INSTANCE = new SpiderMonitor(); private MBeanServer 
mbeanServer; private String jmxServerName; - private List spiderStatuses = new ArrayList(); + private List spiderStatuses = new ArrayList<>(); protected SpiderMonitor() { jmxServerName = "WebMagic"; @@ -51,7 +51,7 @@ public synchronized SpiderMonitor register(Spider... spiders) throws JMException for (Spider spider : spiders) { MonitorSpiderListener monitorSpiderListener = new MonitorSpiderListener(); if (spider.getSpiderListeners() == null) { - List spiderListeners = new ArrayList(); + List spiderListeners = new ArrayList<>(); spiderListeners.add(monitorSpiderListener); spider.setSpiderListeners(spiderListeners); } else { @@ -90,7 +90,7 @@ public void onSuccess(Request request) { } @Override - public void onError(Request request) { + public void onError(Request request, Exception e) { errorUrls.add(request.getUrl()); errorCount.incrementAndGet(); } @@ -109,7 +109,6 @@ public List getErrorUrls() { } protected void registerMBean(SpiderStatusMXBean spiderStatus) throws MalformedObjectNameException, InstanceAlreadyExistsException, MBeanRegistrationException, NotCompliantMBeanException { -// ObjectName objName = new ObjectName(jmxServerName + ":name=" + spiderStatus.getName()); ObjectName objName = new ObjectName(jmxServerName + ":name=" + UrlUtils.removePort(spiderStatus.getName())); mbeanServer.registerMBean(spiderStatus, objName); } diff --git a/webmagic-samples/pom.xml b/webmagic-samples/pom.xml index 449fcf243..e42e1fcd8 100644 --- a/webmagic-samples/pom.xml +++ b/webmagic-samples/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.9.0 4.0.0 @@ -27,22 +27,22 @@ org.mapdb mapdb - 3.0.8 + 3.0.9 com.fasterxml.jackson.core jackson-core - 2.13.0-rc1 + 2.15.2 com.fasterxml.jackson.core jackson-annotations - 2.13.0-rc1 + 2.15.2 com.fasterxml.jackson.core jackson-databind - 2.13.4.2 + 2.15.2 diff --git a/webmagic-saxon/pom.xml b/webmagic-saxon/pom.xml index b73f6fd27..c5238760b 100644 --- a/webmagic-saxon/pom.xml +++ b/webmagic-saxon/pom.xml @@ -3,7 +3,7 @@ 
webmagic-parent us.codecraft - 0.8.0 + 0.9.0 4.0.0 diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java new file mode 100644 index 000000000..b03f3a2ab --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/JaxpSelectorUtils.java @@ -0,0 +1,61 @@ +package us.codecraft.webmagic.selector; + +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +import javax.xml.transform.OutputKeys; +import javax.xml.transform.Transformer; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; +import java.io.StringWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +/** + * @author hooy + */ +public final class JaxpSelectorUtils { + + private JaxpSelectorUtils() { + throw new RuntimeException("The util class cannot be instanced"); + } + + public static List NodeListToArrayList(NodeList nodes) { + List list = new ArrayList<>(nodes.getLength()); + for (int i = 0; i < nodes.getLength(); i++) { + list.add(nodes.item(i)); + } + return list; + } + + public static String nodeToString(Node node) throws TransformerException { + List before = Collections.singletonList(node); + List after = nodesToStrings(before); + if (after.size() > 0) { + return after.get(0); + } else { + return null; + } + } + + public static List nodesToStrings(List nodes) throws TransformerException { + List results = new ArrayList<>(nodes.size()); + Transformer transformer = TransformerFactory.newInstance().newTransformer(); + StreamResult xmlOutput = new StreamResult(); + transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); + for (Node node : nodes) { + if (node.getNodeType() == Node.ATTRIBUTE_NODE || node.getNodeType() == Node.TEXT_NODE) { + 
results.add(node.getTextContent()); + } else { + xmlOutput.setWriter(new StringWriter()); + transformer.transform(new DOMSource(node), xmlOutput); + results.add(xmlOutput.getWriter().toString()); + } + } + return results; + } + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java new file mode 100644 index 000000000..3e6339dda --- /dev/null +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/NodeSelector.java @@ -0,0 +1,32 @@ +package us.codecraft.webmagic.selector; + +import org.w3c.dom.Node; + +import java.util.List; + +/** + * Selector(extractor) for html node.
+ * + * @author hooy
+ * @since 0.8.0 + */ +public interface NodeSelector { + + /** + * Extract single result in text.
+ * If there is more than one result, only the first will be chosen. + * + * @param node node + * @return result + */ + String select(Node node); + + /** + * Extract all results in text.
+ * + * @param node node + * @return results + */ + List selectList(Node node); + +} diff --git a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java index 9d5eef9b0..6c5d7b332 100644 --- a/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java +++ b/webmagic-saxon/src/main/java/us/codecraft/webmagic/selector/Xpath2Selector.java @@ -1,18 +1,10 @@ package us.codecraft.webmagic.selector; -import java.io.StringWriter; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.concurrent.ConcurrentHashMap; import javax.xml.namespace.NamespaceContext; -import javax.xml.transform.OutputKeys; -import javax.xml.transform.Transformer; -import javax.xml.transform.TransformerFactory; -import javax.xml.transform.dom.DOMSource; -import javax.xml.transform.stream.StreamResult; +import javax.xml.parsers.ParserConfigurationException; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; @@ -29,21 +21,24 @@ import net.sf.saxon.lib.NamespaceConstant; import net.sf.saxon.xpath.XPathEvaluator; +import us.codecraft.webmagic.utils.BaseSelectorUtils; + +import static us.codecraft.webmagic.selector.JaxpSelectorUtils.*; /** * 支持xpath2.0的选择器。包装了HtmlCleaner和Saxon HE。
* - * @author code4crafter@gmail.com
- * Date: 13-4-21 - * Time: 上午9:39 + * @author code4crafter@gmail.com, hooy
+ * Date: 13-4-21 + * Time: 上午9:39 */ -public class Xpath2Selector implements Selector { +public class Xpath2Selector implements Selector, NodeSelector { - private String xpathStr; + private final String xpathStr; private XPathExpression xPathExpression; - private Logger logger = LoggerFactory.getLogger(getClass()); + private final Logger logger = LoggerFactory.getLogger(getClass()); public Xpath2Selector(String xpathStr) { this.xpathStr = xpathStr; @@ -54,25 +49,25 @@ public Xpath2Selector(String xpathStr) { } } + public static Xpath2Selector newInstance(String xpathStr) { + return new Xpath2Selector(xpathStr); + } + enum XPath2NamespaceContext implements NamespaceContext { INSTANCE; - private final Map prefix2NamespaceMap = new ConcurrentHashMap(); + private final Map prefix2NamespaceMap = new ConcurrentHashMap<>(); - private final Map> namespace2PrefixMap = new ConcurrentHashMap>(); + private final Map> namespace2PrefixMap = new ConcurrentHashMap<>(); private void put(String prefix, String namespaceURI) { prefix2NamespaceMap.put(prefix, namespaceURI); - List prefixes = namespace2PrefixMap.get(namespaceURI); - if (prefixes == null) { - prefixes = new ArrayList(); - namespace2PrefixMap.put(namespaceURI, prefixes); - } + List prefixes = namespace2PrefixMap.computeIfAbsent(namespaceURI, k -> new ArrayList<>()); prefixes.add(prefix); } - private XPath2NamespaceContext() { + XPath2NamespaceContext() { put("fn", NamespaceConstant.FN); put("xslt", NamespaceConstant.XSLT); put("xhtml", NamespaceConstant.XHTML); @@ -111,32 +106,18 @@ private void init() throws XPathExpressionException { @Override public String select(String text) { try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); - Object result; - try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); - } catch (XPathExpressionException e) { - result = 
xPathExpression.evaluate(document, XPathConstants.STRING); - } - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - if (nodeList.getLength() == 0) { - return null; - } - Node item = nodeList.item(0); - if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { - return item.getTextContent(); - } else { - StreamResult xmlOutput = new StreamResult(new StringWriter()); - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - transformer.transform(new DOMSource(item), xmlOutput); - return xmlOutput.getWriter().toString(); - } - } - return result.toString(); + Document doc = parse(text); + return select(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public String select(Node node) { + try { + return (String) xPathExpression.evaluate(node, XPathConstants.STRING); } catch (Exception e) { logger.error("select text error! 
" + xpathStr, e); } @@ -145,38 +126,72 @@ public String select(String text) { @Override public List selectList(String text) { - List results = new ArrayList(); try { - HtmlCleaner htmlCleaner = new HtmlCleaner(); - TagNode tagNode = htmlCleaner.clean(text); - Document document = new DomSerializer(new CleanerProperties()).createDOM(tagNode); - Object result; - try { - result = xPathExpression.evaluate(document, XPathConstants.NODESET); - } catch (XPathExpressionException e) { - result = xPathExpression.evaluate(document, XPathConstants.STRING); - } - if (result instanceof NodeList) { - NodeList nodeList = (NodeList) result; - Transformer transformer = TransformerFactory.newInstance().newTransformer(); - StreamResult xmlOutput = new StreamResult(); - transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); - for (int i = 0; i < nodeList.getLength(); i++) { - Node item = nodeList.item(i); - if (item.getNodeType() == Node.ATTRIBUTE_NODE || item.getNodeType() == Node.TEXT_NODE) { - results.add(item.getTextContent()); - } else { - xmlOutput.setWriter(new StringWriter()); - transformer.transform(new DOMSource(item), xmlOutput); - results.add(xmlOutput.getWriter().toString()); - } - } - } else { - results.add(result.toString()); - } + Document doc = parse(text); + return selectList(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + @Override + public List selectList(Node node) { + try { + NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); + List nodes = NodeListToArrayList(result); + return nodesToStrings(nodes); } catch (Exception e) { logger.error("select text error! " + xpathStr, e); } - return results; + return null; } + + public Node selectNode(String text) { + try { + Document doc = parse(text); + return selectNode(doc); + } catch (Exception e) { + logger.error("select text error! 
" + xpathStr, e); + } + return null; + } + + public Node selectNode(Node node) { + try { + return (Node) xPathExpression.evaluate(node, XPathConstants.NODE); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public List selectNodes(String text) { + try { + Document doc = parse(text); + return selectNodes(doc); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + public List selectNodes(Node node) { + try { + NodeList result = (NodeList) xPathExpression.evaluate(node, XPathConstants.NODESET); + return NodeListToArrayList(result); + } catch (Exception e) { + logger.error("select text error! " + xpathStr, e); + } + return null; + } + + protected static Document parse(String text) throws ParserConfigurationException { + // HtmlCleaner could not parse or tag directly + text = BaseSelectorUtils.preParse(text); + HtmlCleaner htmlCleaner = new HtmlCleaner(); + TagNode tagNode = htmlCleaner.clean(text); + return new DomSerializer(new CleanerProperties()).createDOM(tagNode); + } + } diff --git a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java index 166188361..4033fcfbd 100644 --- a/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java +++ b/webmagic-saxon/src/test/java/us/codecraft/webmagic/selector/XpathSelectorTest.java @@ -11,9 +11,15 @@ import org.junit.Ignore; import org.junit.Test; +import org.w3c.dom.Node; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Spider; +import us.codecraft.webmagic.processor.PageProcessor; import us.codecraft.xsoup.XPathEvaluator; import us.codecraft.xsoup.Xsoup; +import javax.xml.transform.TransformerException; + /** * @author code4crafter@gmail.com
Date: 13-4-21 Time: 上午10:06 */ @@ -1389,31 +1395,31 @@ public void testXpath2Selector() { @Test public void performanceTest() { Xpath2Selector xpath2Selector = new Xpath2Selector("//a"); - long time =System.currentTimeMillis(); + long time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); XpathSelector xpathSelector = new XpathSelector("//a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpathSelector.selectList(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { xpath2Selector.selectList(html); } System.out.println(System.currentTimeMillis() - time); CssSelector cssSelector = new CssSelector("a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 1000; i++) { cssSelector.selectList(html); } - System.out.println("css "+(System.currentTimeMillis()-time)); + System.out.println("css " + (System.currentTimeMillis() - time)); } @Ignore("take long time") @@ -1425,55 +1431,92 @@ public void parserPerformanceTest() throws XPatherException { TagNode tagNode = htmlCleaner.clean(html); Document document = Jsoup.parse(html); - long time =System.currentTimeMillis(); + long time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); 
System.out.println("============="); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { Jsoup.parse(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { document.select("a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { htmlCleaner.clean(html); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { tagNode.evaluateXPath("//a"); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); System.out.println("============="); XPathEvaluator compile = Xsoup.compile("//a"); - time =System.currentTimeMillis(); + time = System.currentTimeMillis(); for (int i = 0; i < 2000; i++) { compile.evaluate(document); } - System.out.println(System.currentTimeMillis()-time); + System.out.println(System.currentTimeMillis() - time); } + /** + * New api test + * + * @author hooy + * @since 8.0 + */ + private String rank = "

点击榜

排名分类书名/最新章节作者推荐更新时间
1.现实
0
11-24 22:32
2.架空
1047
03-04 14:44
3.现实
0
07-20 09:06
4.豪门
0
12-03 09:12
5.现实
0
02-01 21:12
6.玄奇
3455
02-28 12:31
7.玄奇
20614
03-31 12:37
8.复仇
55
06-03 11:43
9.穿越
0
10-27 18:50
10.宫斗
320
10-31 13:58
11.宫斗
6268
07-12 20:23
12.现实
0
01-18 23:00
13.婚恋
0
12-14 20:50
14.修真
0
02-03 23:40
15.豪门
0
11-06 23:38
16.穿越
191
12-02 23:37
17.穿越
412
10-13 22:39
18.豪门
635
07-01 13:15
19.架空
144
06-18 09:35
20.宅斗
1032
08-15 19:03
21.宫斗
0
09-30 20:32
22.豪门
0
06-05 11:31
23.重生
80
11-25 19:56
24.异世
68
01-12 10:06
25.豪门
0
05-29 18:46
26.婚恋
2778
11-04 17:48
27.玄奇
207
12-06 16:57
28.穿越
260
01-04 23:26
29.豪门
0
12-07 21:39
30.架空
1127
06-06 17:28
31.穿越
113
09-13 09:06
32.架空
597
02-14 18:47
33.玄奇
528
06-04 22:04
34.穿越
328
06-06 22:09
35.架空
539
05-24 14:42
36.架空
0
03-05 23:27
37.穿越
3215
08-21 16:38
38.宫斗
905
08-04 20:24
39.玄奇
1328
07-25 10:58
40.穿越
203
01-27 20:53
41.宫斗
407
08-31 09:03
42.宅斗
16
05-03 17:38
43.豪门
0
11-10 08:00
44.婚恋
0
07-12 21:37
45.架空
0
06-23 21:02
46.玄奇
1382
05-31 20:36
47.重生
334
07-16 19:19
48.婚恋
505
11-01 16:42
49.婚恋
0
10-19 18:32
50.豪门
540
09-19 19:18
51.婚恋
226
03-18 13:09
52.穿越
1026
03-08 16:28
53.重生
304
02-19 10:25
54.玄奇
2617
02-15 20:57
55.穿越
199
09-04 19:43
56.同人
768
07-19 20:00
57.宅斗
0
02-13 18:13
58.豪门
0
11-12 22:23
59.架空
0
07-28 23:42
60.婚恋
0
02-03 23:09
61.豪门
285
01-07 19:21
62.重生
654
10-12 18:16
63.异能
617
06-18 20:23
64.宫斗
27
06-02 21:05
65.种田
206
08-31 19:23
66.宅斗
2444
08-19 15:51
67.宅斗
818
08-07 23:38
68.现代
0
12-23 17:02
69.玄奇
0
07-23 12:00
70.婚恋
0
11-01 16:43
71.豪门
0
09-12 00:01
72.架空
0
04-27 22:42
73.豪门
0
04-19 13:55
74.异能
62
07-30 00:00
75.穿越
1307
07-20 16:41
76.玄奇
12820
07-15 23:46
77.架空
828
06-06 17:54
78.宅斗
985
05-20 23:53
79.玄奇
4960
04-12 15:58
80.玄奇
245
03-02 23:11
81.宅斗
34
12-21 10:11
82.宅斗
1411
07-21 00:00
83.现代
0
07-31 10:10
84.玄奇
0
06-18 13:53
85.架空
0
12-03 23:41
86.玄奇
0
11-28 22:13
87.豪门
0
11-07 22:48
88.婚恋
0
08-29 23:15
89.种田
1831
08-21 16:38
90.豪门
0
07-11 21:25
91.豪门
0
06-13 15:37
92.豪门
0
05-07 22:10
93.豪门
0
02-28 00:01
94.豪门
304
12-16 07:30
95.婚恋
669
11-07 18:16
96.仙侠
54
09-25 19:51
97.豪门
655
08-31 13:02
98.现实
374
06-29 09:55
99.穿越
373
06-19 18:07
100.婚恋
159
06-04 21:05
"; + + @Test + public void testStringAPI() { + // testAPI: selectList(String) -> selectList(Node) + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectList(rank); + Assert.assertSame(100, items.size()); + // testAPI: select(String) -> select(Node) + String name = new Xpath2Selector("//td[3]/div/a[1]/text()").select(items.get(10)); + Assert.assertEquals("深宫安容传", name); + } + + @Test + public void testNodeAPI() { + // testAPI: selectNodes(String) -> selectNodes(Node) + List items = new Xpath2Selector("//div[@class=\"bd\"]//tbody/tr").selectNodes(rank); + Assert.assertSame(100, items.size()); + // testAPI: selectNode(Node) + Node item = new Xpath2Selector("./td[3]/div/a[1]").selectNode(items.get(10)); + String name = new Xpath2Selector("./text()").select(item); + Assert.assertEquals("深宫安容传", name); + } + + @Test + public void testUtilAPI() throws TransformerException { + Node item = Xpath2Selector.newInstance("//div[@class=\"bd\"]//tbody/tr[11]/td[3]/div/a[1]/text()").selectNode(rank); + // testAPI: nodeToString(Node) -> nodesToStrings(List) + String name = JaxpSelectorUtils.nodeToString(item); + Assert.assertEquals("深宫安容传", name); + } + } diff --git a/webmagic-scripts/pom.xml b/webmagic-scripts/pom.xml index 3ec15f9af..0019ea3c8 100644 --- a/webmagic-scripts/pom.xml +++ b/webmagic-scripts/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.9.0 4.0.0 diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 715d7731b..63682001f 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -3,7 +3,7 @@ webmagic-parent us.codecraft - 0.8.0 + 0.9.0 4.0.0