From 36b25fac7c56695da936a58677a330a5315a26bc Mon Sep 17 00:00:00 2001 From: "xuefeng.cai" Date: Sun, 3 Dec 2017 19:29:04 +0800 Subject: [PATCH 1/2] =?UTF-8?q?ISSUE#688=20=E5=A2=9E=E5=8A=A0=E4=BA=86Chro?= =?UTF-8?q?meDriverDownloader=20=E5=8F=96=E4=BB=A3SeleniumDownloader?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- webmagic-selenium/pom.xml | 7 +- .../codecraft/webmagic/downloader/.DS_Store | Bin 0 -> 6148 bytes .../selenium/ChromeDriverDownloader.java | 147 ++++++++++++++++++ .../selenium/ScreenShotServiceImpl.java | 90 +++++++++++ .../selenium/ChromeDriverDownloaderTest.java | 58 +++++++ .../selenium/SeleniumDownloaderTest.java | 1 + 6 files changed, 302 insertions(+), 1 deletion(-) create mode 100644 webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/.DS_Store create mode 100644 webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloader.java create mode 100644 webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ScreenShotServiceImpl.java create mode 100644 webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloaderTest.java diff --git a/webmagic-selenium/pom.xml b/webmagic-selenium/pom.xml index 1cbf59216..68afd3d9e 100644 --- a/webmagic-selenium/pom.xml +++ b/webmagic-selenium/pom.xml @@ -32,7 +32,12 @@ junit junit - + + org.apache.commons + commons-pool2 + 2.4.2 + + diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/.DS_Store b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..b4becf4757e1512e4accae4dac99940f84f98e14 GIT binary patch literal 6148 zcmeHK%Sr=55Ukc57QE!>ael!+7()Dl{D22Vgk%LJp7-Q;<XKrW#gx@)GV zYnH9U_BH@p?VlfjC4hnMh;I*b^KU!anm~>bTA68GcnoumB&ih-G!+N5k6p#YP z3f$&)>HYta{?Gh>OwvvYNP(+Tz!vMxdc{|&-a30Z@3oEoME9Bl-Hq#@Fhn~hMmy%l f+wpxAWnJ?%&wJsN7mri^msa2hNOKjm literal 0 HcmV?d00001 diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloader.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloader.java new file mode 100644 index 000000000..94be3dd09 --- /dev/null +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloader.java @@ -0,0 +1,147 @@ +package us.codecraft.webmagic.downloader.selenium; + +import java.io.Closeable; +import java.io.IOException; +import java.util.Map; +import org.apache.commons.pool2.BasePooledObjectFactory; +import org.apache.commons.pool2.PooledObject; +import org.apache.commons.pool2.impl.DefaultPooledObject; +import org.apache.commons.pool2.impl.GenericObjectPool; +import org.apache.commons.pool2.impl.GenericObjectPoolConfig; +import org.apache.log4j.Logger; +import org.openqa.selenium.By; +import org.openqa.selenium.Cookie; +import org.openqa.selenium.WebDriver; +import org.openqa.selenium.WebElement; +import org.openqa.selenium.chrome.ChromeDriver; +import org.openqa.selenium.chrome.ChromeOptions; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; +import us.codecraft.webmagic.downloader.Downloader; +import us.codecraft.webmagic.selector.Html; +import us.codecraft.webmagic.selector.PlainText; + +/** + * 类似于SeleniumDownloader,但是只用ChromeDriver
+ * ChromeDriver下载地址:http://chromedriver.storage.googleapis.com/index.html + * chrome浏览器版本与chromeDriver驱动包版本是要注意 + * + * @author Stephen Cai + * @date: 2017-12-03 18:31 + */ +public class ChromeDriverDownloader implements Downloader, Closeable { + + private GenericObjectPool pool; + + private ChromeOptions options; + + private Logger logger = Logger.getLogger(getClass()); + + private int sleepTime = 0; + + private int poolSize = 1; + + private static final String DRIVER_PHANTOMJS = "phantomjs"; + + /** + * 新建 + * + * @param chromeDriverPath chromeDriverPath + */ + public ChromeDriverDownloader(String chromeDriverPath) { + System.getProperties().setProperty("webdriver.chrome.driver", + chromeDriverPath); + options = new ChromeOptions(); + GenericObjectPoolConfig poolConfig = new GenericObjectPoolConfig(); + poolConfig.setMaxTotal(5); + poolConfig.setMaxIdle(5); + poolConfig.setMinIdle(1); + pool = new GenericObjectPool( + new BasePooledObjectFactory() { + @Override + public ChromeDriver create() throws Exception { + return new ChromeDriver(options); + } + + @Override + public PooledObject wrap(final ChromeDriver chromeDriver) { + return new DefaultPooledObject(chromeDriver) { + @Override + public synchronized void invalidate() { + chromeDriver.quit(); + super.invalidate(); + } + }; + } + }, poolConfig); + } + + + /** + * set sleep time to wait until load success + * + * @param sleepTime sleepTime + * @return this + */ + public ChromeDriverDownloader setSleepTime(int sleepTime) { + this.sleepTime = sleepTime; + return this; + } + + @Override + public Page download(Request request, Task task) { + ChromeDriver chromeDriver; + try { + chromeDriver = pool.borrowObject(); + } catch (Exception e) { + logger.error("get from pool error", e); + return null; + } + logger.info("downloading page " + request.getUrl()); + chromeDriver.get(request.getUrl()); + try { + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + e.printStackTrace(); + } + WebDriver.Options manage = chromeDriver.manage(); + Site site = task.getSite(); + if (site.getCookies() != null) { + for (Map.Entry cookieEntry : site.getCookies() + .entrySet()) { + Cookie cookie = new Cookie(cookieEntry.getKey(), + cookieEntry.getValue()); + manage.addCookie(cookie); + } + } + + /* + * TODO You can add mouse event or other processes + * + * @author: bob.li.0718@gmail.com + */ + + WebElement webElement = chromeDriver.findElement(By.xpath("/html")); + String content = webElement.getAttribute("outerHTML"); + Page page = new Page(); + page.setRawText(content); + page.setHtml(new Html(content, request.getUrl())); + page.setUrl(new PlainText(request.getUrl())); + page.setRequest(request); + pool.returnObject(chromeDriver); + return page; + } + + + @Override + public void setThread(int thread) { + this.poolSize = thread; + } + + @Override + public void close() throws IOException { + pool.close(); + } +} diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ScreenShotServiceImpl.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ScreenShotServiceImpl.java new file mode 100644 index 000000000..cdc532954 --- /dev/null +++ b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ScreenShotServiceImpl.java @@ -0,0 +1,90 @@ +package us.codecraft.webmagic.downloader.selenium; + +/** + * @author code4crafter@gmail.com + * Date: 2017/9/12 + * Time: 下午10:57 + */ +public class ScreenShotServiceImpl { + // + //private GenericObjectPool pool; + // + //private ChromeOptions options; + // + // + //@PreDestroy + //public void destroy(){ + // pool.close(); + //} + // + //@PostConstruct + //public void init(){ + // System.getProperties().setProperty("webdriver.chrome.driver", + // "/usr/share/chromedriver"); + // options = new ChromeOptions(); + // options.addArguments("--headless","--disable-gpu"); + // options.setExperimentalOption("mobileEmulation", + // ImmutableMap.builder().put("deviceMetrics", + // ImmutableMap.builder().put("width", 360).put("height", 640).put("pixelRatio", 3).build()) + // .build()); + // GenericObjectPoolConfig poolConfig = new GenericObjectPoolConfig(); + // poolConfig.setMaxTotal(5); + // poolConfig.setMaxIdle(5); + // poolConfig.setMinIdle(1); + // pool = new GenericObjectPool( + // new BasePooledObjectFactory() { + // @Override + // public ChromeDriver create() throws Exception { + // return new ChromeDriver(options); + // } + // + // @Override + // public PooledObject wrap(ChromeDriver chromeDriver) { + // return new DefaultPooledObject(chromeDriver) { + // @Override + // public synchronized void invalidate() { + // chromeDriver.quit(); + // super.invalidate(); + // } + // }; + // } + // }, poolConfig); + // + //} + // + //@Override + //public byte[] getMobileScreenShot(String url) { + // ChromeDriver driver; + // try { + // driver = pool.borrowObject(); + // } catch (Exception e) { + // log.error("get from pool error", e); + // return new byte[] {}; + // } + // try { + // driver.get(url); + // long width = (Long) driver.executeScript( + // "return Math.max(document.body.scrollWidth, document.body.offsetWidth, document.documentElement.clientWidth, document.documentElement.scrollWidth, document.documentElement.offsetWidth);"); + // long height = (Long) driver.executeScript( + // "return Math.max(document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);"); + // driver.manage().window().setSize(new Dimension((int) width + 100, (int) height + 100)); + // byte[] bytes = driver.getScreenshotAs(OutputType.BYTES); + // BufferedImage img = null; + // try { + // img = ImageIO.read(new ByteArrayInputStream(bytes)); + // BufferedImage newBufferedImage = new BufferedImage(img.getWidth(), + // img.getHeight(), BufferedImage.TYPE_INT_RGB); + // newBufferedImage.createGraphics().drawImage(img, 0, 0, Color.WHITE, null); + // ByteArrayOutputStream output = new ByteArrayOutputStream(); + // ImageIO.write(newBufferedImage, "jpg", output); + // log.info("get data bytes {}", bytes.length); + // return output.toByteArray(); + // } catch (IOException e) { + // log.error("convert image error {}", bytes.length, e); + // } + // return bytes; + // } finally { + // pool.returnObject(driver); + // } + //} +} diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloaderTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloaderTest.java new file mode 100644 index 000000000..2206a4dbd --- /dev/null +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/ChromeDriverDownloaderTest.java @@ -0,0 +1,58 @@ +package us.codecraft.webmagic.downloader.selenium; + +import org.junit.Ignore; +import org.junit.Test; +import us.codecraft.webmagic.Page; +import us.codecraft.webmagic.Request; +import us.codecraft.webmagic.Site; +import us.codecraft.webmagic.Task; + +/** + * @author Stephen Cai + * @date 2017-12-03 18:33 + */ +public class ChromeDriverDownloaderTest { + private String chromeDriverPath = "/Users/caifeng/Downloads/chromedriver"; + + @Ignore("need chrome driver") + @Test + public void test() { + ChromeDriverDownloader chromeDriverDownloader = new ChromeDriverDownloader(chromeDriverPath); + long time1 = System.currentTimeMillis(); + for (int i = 0; i < 100; i++) { + Page page = chromeDriverDownloader.download(new Request("http://huaban.com/"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } + + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all()); + } + System.out.println(System.currentTimeMillis() - time1); + } + + @Ignore + @Test + public void testBaiduWenku() { + ChromeDriverDownloader chromeDriverDownloader = new ChromeDriverDownloader(chromeDriverPath); + chromeDriverDownloader.setSleepTime(10000); + long time1 = System.currentTimeMillis(); + Page page = chromeDriverDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() { + @Override + public String getUUID() { + return "huaban.com"; + } + + @Override + public Site getSite() { + return Site.me(); + } + }); + System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>","").replace("&nsbp;","").all()); + } +} diff --git a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java index 2b8c24711..df2a7f044 100644 --- a/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java +++ b/webmagic-selenium/src/test/java/us/codecraft/webmagic/downloader/selenium/SeleniumDownloaderTest.java @@ -19,6 +19,7 @@ public class SeleniumDownloaderTest { @Ignore("need chrome driver") @Test public void test() { + System.getProperties().put("selenuim_config", "config.ini"); SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath); long time1 = System.currentTimeMillis(); for (int i = 0; i < 100; i++) { From 56bf87825f19ad12cf66f3b92d5bce67e981e1e6 Mon Sep 17 00:00:00 2001 From: "xuefeng.cai" Date: Mon, 4 Dec 2017 10:35:56 +0800 Subject: [PATCH 2/2] delete test --- .../selenium/ScreenShotServiceImpl.java | 90 ------------------- 1 file changed, 90 deletions(-) delete mode 100644 webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ScreenShotServiceImpl.java diff --git a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ScreenShotServiceImpl.java b/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ScreenShotServiceImpl.java deleted file mode 100644 index cdc532954..000000000 --- a/webmagic-selenium/src/main/java/us/codecraft/webmagic/downloader/selenium/ScreenShotServiceImpl.java +++ /dev/null @@ -1,90 +0,0 @@ -package us.codecraft.webmagic.downloader.selenium; - -/** - * @author code4crafter@gmail.com - * Date: 2017/9/12 - * Time: 下午10:57 - */ -public class ScreenShotServiceImpl { - // - //private GenericObjectPool pool; - // - //private ChromeOptions options; - // - // - //@PreDestroy - //public void destroy(){ - // pool.close(); - //} - // - //@PostConstruct - //public void init(){ - // System.getProperties().setProperty("webdriver.chrome.driver", - // "/usr/share/chromedriver"); - // options = new ChromeOptions(); - // options.addArguments("--headless","--disable-gpu"); - // options.setExperimentalOption("mobileEmulation", - // ImmutableMap.builder().put("deviceMetrics", - // ImmutableMap.builder().put("width", 360).put("height", 640).put("pixelRatio", 3).build()) - // .build()); - // GenericObjectPoolConfig poolConfig = new GenericObjectPoolConfig(); - // poolConfig.setMaxTotal(5); - // poolConfig.setMaxIdle(5); - // poolConfig.setMinIdle(1); - // pool = new GenericObjectPool( - // new BasePooledObjectFactory() { - // @Override - // public ChromeDriver create() throws Exception { - // return new ChromeDriver(options); - // } - // - // @Override - // public PooledObject wrap(ChromeDriver chromeDriver) { - // return new DefaultPooledObject(chromeDriver) { - // @Override - // public synchronized void invalidate() { - // chromeDriver.quit(); - // super.invalidate(); - // } - // }; - // } - // }, poolConfig); - // - //} - // - //@Override - //public byte[] getMobileScreenShot(String url) { - // ChromeDriver driver; - // try { - // driver = pool.borrowObject(); - // } catch (Exception e) { - // log.error("get from pool error", e); - // return new byte[] {}; - // } - // try { - // driver.get(url); - // long width = (Long) driver.executeScript( - // "return Math.max(document.body.scrollWidth, document.body.offsetWidth, document.documentElement.clientWidth, document.documentElement.scrollWidth, document.documentElement.offsetWidth);"); - // long height = (Long) driver.executeScript( - // "return Math.max(document.body.scrollHeight, document.body.offsetHeight, document.documentElement.clientHeight, document.documentElement.scrollHeight, document.documentElement.offsetHeight);"); - // driver.manage().window().setSize(new Dimension((int) width + 100, (int) height + 100)); - // byte[] bytes = driver.getScreenshotAs(OutputType.BYTES); - // BufferedImage img = null; - // try { - // img = ImageIO.read(new ByteArrayInputStream(bytes)); - // BufferedImage newBufferedImage = new BufferedImage(img.getWidth(), - // img.getHeight(), BufferedImage.TYPE_INT_RGB); - // newBufferedImage.createGraphics().drawImage(img, 0, 0, Color.WHITE, null); - // ByteArrayOutputStream output = new ByteArrayOutputStream(); - // ImageIO.write(newBufferedImage, "jpg", output); - // log.info("get data bytes {}", bytes.length); - // return output.toByteArray(); - // } catch (IOException e) { - // log.error("convert image error {}", bytes.length, e); - // } - // return bytes; - // } finally { - // pool.returnObject(driver); - // } - //} -}