diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java index 85ff5fa69..74ea718e5 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/HtmlNode.java @@ -31,6 +31,11 @@ public Selectable smartContent() { return select(smartContentSelector, getSourceTexts()); } + /** Runs smart-content selection over this node's source texts with a caller-supplied threshold; the selector is obtained from Selectors.smartContent(threshold). */ public Selectable smartContent(int threshold) { + SmartContentSelector smartContentSelector = Selectors.smartContent(threshold); + return select(smartContentSelector, getSourceTexts()); + } + @Override public Selectable links() { return selectElements(new LinksSelector()); diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java index 7cd68c1d6..3600896e2 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/Selectors.java @@ -20,6 +20,10 @@ public static SmartContentSelector smartContent() { return new SmartContentSelector(); } + /** Factory for a SmartContentSelector constructed with the given threshold, as an alternative to the no-argument smartContent(). */ public static SmartContentSelector smartContent(int threshold) { + return new SmartContentSelector(threshold); + } + public static CssSelector $(String expr) { return new CssSelector(expr); } diff --git a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java index ff8e26998..c8816510b 100644 --- a/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java +++ b/webmagic-core/src/main/java/us/codecraft/webmagic/selector/SmartContentSelector.java @@ -16,9 +16,15 @@ @Experimental public class SmartContentSelector implements Selector { + /** Threshold for this selector; 86 unless overridden through the int constructor. Presumably read by select(String), whose same-named local is removed in this change; confirm against the full method. */ private int threshold = 86; + public SmartContentSelector() { } + /** Creates a selector with the given threshold in place of the default 86; the value is stored as-is with no validation. */ public SmartContentSelector(int threshold) 
{ + this.threshold = threshold; + } + @Override public String select(String html) { html = html.replaceAll("(?is)", ""); @@ -29,7 +35,6 @@ public String select(String html) { html = html.replaceAll("(?is)<.*?>", ""); List lines; int blocksWidth =3; - int threshold =86; int start; int end; StringBuilder text = new StringBuilder();