diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 1cbf59216..68afd3d9e 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -32,7 +32,12 @@ junit junit - + + org.apache.commons + commons-pool2 + 2.4.2 + + diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/.DS_Store b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/.DS_Store new file mode 100644 index 000000000..b4becf475 Binary files /dev/null and b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/.DS_Store differ diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloader.java new file mode 100644 index 000000000..94be3dd09 --- /dev/null +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloader.java @@ -0,0 +1,147 @@ +package us.codecraft.webmagic.downloader.selenium; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Map; +import org.apache.commons.pool2.BasePooledObjectFactory; +import org.apache.commons.pool2.PooledObject; +import org.apache.commons.pool2.impl.DefaultPooledObject; +import org.apache.commons.pool2.impl.GenericObjectPool; +import org.apache.commons.pool2.impl.GenericObjectPoolConfig; +import org.apache.log4j.Logger; +import org.openqa.selenium.By; +import org.openqa.selenium.Cookie; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * 
Similar to SeleniumDownloader, but uses ChromeDriver only.
+ * ChromeDriver download page: http://chromedriver.storage.googleapis.com/index.html + * Pay attention to compatibility between the Chrome browser version and the chromedriver package version. + * NOTE(review): download() returns the borrowed driver to the pool only on the normal path, and setThread() stores poolSize without applying it to the pool configuration. + * @author Stephen Cai + * @date: 2017-12-03 18:31 + */ +public class ChromeDriverDownloader implements Downloader, Closeable { + + private GenericObjectPool pool; + + private ChromeOptions options; + + private Logger logger = Logger.getLogger(getClass()); + + private int sleepTime = 0; + + private int poolSize = 1; + + private static final String DRIVER_PHANTOMJS = "phantomjs"; + + /** + * Creates a downloader backed by a pool of ChromeDriver instances (maxTotal 5, maxIdle 5, minIdle 1). + * + * @param chromeDriverPath value assigned to the webdriver.chrome.driver system property + */ + public ChromeDriverDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + options = new ChromeOptions(); + GenericObjectPoolConfig poolConfig = new GenericObjectPoolConfig(); + poolConfig.setMaxTotal(5); + poolConfig.setMaxIdle(5); + poolConfig.setMinIdle(1); + pool = new GenericObjectPool( + new BasePooledObjectFactory() { + @Override + public ChromeDriver create() throws Exception { + return new ChromeDriver(options); + } + + @Override + public PooledObject wrap(final ChromeDriver chromeDriver) { + return new DefaultPooledObject(chromeDriver) { + @Override + public synchronized void invalidate() { + chromeDriver.quit(); + super.invalidate(); + } + }; + } + }, poolConfig); + } + + + /** + * Sets how long download() sleeps after requesting a page before reading its HTML. + * + * @param sleepTime sleep time in milliseconds, passed to Thread.sleep + * @return this + */ + public ChromeDriverDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } + + @Override + public Page download(Request request, Task task) { + ChromeDriver chromeDriver; + try { + chromeDriver = pool.borrowObject(); + } catch (Exception e) { + logger.error("get from pool error", e); + return null; + } + logger.info("downloading page " + request.getUrl()); + chromeDriver.get(request.getUrl()); + try { + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage 
= chromeDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } + + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ + + WebElement webElement = chromeDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + Page page = new Page(); + page.setRawText(content); + page.setHtml(new Html(content, request.getUrl())); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + pool.returnObject(chromeDriver); + return page; + } + + + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + + @Override + public void close() throws IOException { + pool.close(); + } +} diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloaderTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloaderTest.java new file mode 100644 index 000000000..2206a4dbd --- /dev/null +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloaderTest.java @@ -0,0 +1,58 @@ +package us.codecraft.webmagic.downloader.selenium; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +/** + * @author Stephen Cai + * @date 2017-12-03 18:33 + */ +public class ChromeDriverDownloaderTest { + private String chromeDriverPath = "/Users/caifeng/Downloads/chromedriver"; + + @Ignore("need chrome driver") + @Test + public void test() { + ChromeDriverDownloader chromeDriverDownloader = new ChromeDriverDownloader(chromeDriverPath); + long time1 = System.currentTimeMillis(); + for (int i = 0; i < 100; i++) { + 
Page page = chromeDriverDownloader.download(new Request("http://huaban.com/"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } + + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); + } + System.out.println(System.currentTimeMillis() - time1); + } + + @Ignore + @Test + public void testBaiduWenku() { + ChromeDriverDownloader chromeDriverDownloader = new ChromeDriverDownloader(chromeDriverPath); + chromeDriverDownloader.setSleepTime(10000); + long time1 = System.currentTimeMillis(); + Page page = chromeDriverDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } + + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nsbp;","").all()); + } +} diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java index 2b8c24711..df2a7f044 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java @@ -19,6 +19,7 @@ public class SeleniumDownloaderTest { @Ignore("need chrome driver") @Test public void test() { + System.getProperties().put("selenuim_config", "config.ini"); SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); long time1 = System.currentTimeMillis(); for (int i = 0; i < 100; i++) {