Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

import lombok.Getter;
import lombok.Setter;

import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.selector.Selector;

/**
Expand All @@ -20,9 +22,7 @@ public class Extractor {
protected final boolean notNull;

protected final boolean multi;

public static enum Source {Html, Url, RawHtml, RawText}


public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
this.selector = selector;
this.source = source;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package us.codecraft.webmagic.model;

import us.codecraft.webmagic.model.formatter.ObjectFormatter;
import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.selector.Selector;

import java.lang.reflect.Field;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,9 @@
import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.model.fields.PageField;
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
import us.codecraft.webmagic.model.selections.MultipleSelection;
import us.codecraft.webmagic.model.selections.Selection;
import us.codecraft.webmagic.model.selections.SingleSelection;
import us.codecraft.webmagic.model.sources.Source;
import us.codecraft.webmagic.model.sources.SourceTextExtractor;
import us.codecraft.webmagic.model.sources.Source.*;
import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ClassUtils;
import us.codecraft.webmagic.utils.ExtractorUtils;
Expand Down Expand Up @@ -95,7 +95,7 @@ private FieldExtractor getAnnotationExtractByUrl(Class clazz, Field field) {
regexPattern = ".*";
}
fieldExtractor = new FieldExtractor(field,
new RegexSelector(regexPattern), FieldExtractor.Source.Url, extractByUrl.notNull(),
new RegexSelector(regexPattern), new Url(), extractByUrl.notNull(),
extractByUrl.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
Expand All @@ -121,7 +121,7 @@ private FieldExtractor getAnnotationExtractCombo(Class clazz, Field field) {
default:
selector = new AndSelector(ExtractorUtils.getSelectors(extractBies));
}
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? FieldExtractor.Source.RawHtml : FieldExtractor.Source.Html,
fieldExtractor = new FieldExtractor(field, selector, comboExtract.source() == ComboExtract.Source.RawHtml ? new RawHtml() : new SelectedHtml(),
comboExtract.notNull(), comboExtract.multi() || List.class.isAssignableFrom(field.getType()));
Method setterMethod = getSetterMethod(clazz, field);
if (setterMethod != null) {
Expand All @@ -136,26 +136,23 @@ private FieldExtractor getAnnotationExtractBy(Class clazz, Field field) {
ExtractBy extractBy = field.getAnnotation(ExtractBy.class);
if (extractBy != null) {
Selector selector = ExtractorUtils.getSelector(extractBy);
ExtractBy.Source source0 = extractBy.source();
if (extractBy.type()== ExtractBy.Type.JsonPath){
source0 = RawText;
}
FieldExtractor.Source source = null;
switch (source0){
ExtractBy.Source extractSource = extractBy.source();
if (extractBy.type()== ExtractBy.Type.JsonPath)
extractSource = RawText;
Source source = null;
switch (extractSource) {
case RawText:
source = FieldExtractor.Source.RawText;
source = new RawText();
break;
case RawHtml:
source = FieldExtractor.Source.RawHtml;
source = new RawHtml();
break;
case SelectedHtml:
source =FieldExtractor.Source.Html;
source = new SelectedHtml();
break;
default:
source =FieldExtractor.Source.Html;

source = new SelectedHtml();
}

fieldExtractor = new FieldExtractor(field, selector, source,
extractBy.notNull(), List.class.isAssignableFrom(field.getType()));
fieldExtractor.setSetterMethod(getSetterMethod(clazz, field));
Expand Down Expand Up @@ -202,7 +199,7 @@ private void initClassExtractors() {
annotation = clazz.getAnnotation(ExtractBy.class);
if (annotation != null) {
ExtractBy extractBy = (ExtractBy) annotation;
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), Extractor.Source.Html, extractBy.notNull(), extractBy.multi());
objectExtractor = new Extractor(new XpathSelector(extractBy.value()), new SelectedHtml(), extractBy.notNull(), extractBy.multi());
}
}

Expand Down Expand Up @@ -242,8 +239,7 @@ private Object processSingle(Page page, String html, boolean isRaw) {
try {
o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) {
Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection();
PageField field = selection.extractField(page, html, isRaw, fieldExtractor);
PageField field = SourceTextExtractor.getText(page, html, isRaw, fieldExtractor);
if (!field.operation(o, fieldExtractor, logger))
return null;
}
Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
package us.codecraft.webmagic.model.sources;

import java.util.List;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;

/**
 * Strategy describing which piece of a {@link Page} a {@link FieldExtractor}'s
 * selector is applied to.
 *
 * <p>Each nested implementation picks a different input (the page's HTML
 * document, the supplied {@code html} fragment, the page URL or the raw
 * response text) and runs {@code fieldExtractor.getSelector()} against it.
 */
public interface Source {

    /**
     * Extracts a single value.
     *
     * @param page           page being processed
     * @param html           HTML fragment handed in by the caller
     * @param isRaw          flag consulted by {@link SelectedHtml} to choose between
     *                       the page document and {@code html}; ignored by the
     *                       other implementations in this file
     * @param fieldExtractor supplies the selector to apply
     * @return whatever the selector yields for the chosen input
     */
    String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);

    /**
     * Extracts every matching value.
     *
     * @param page           page being processed
     * @param html           HTML fragment handed in by the caller
     * @param isRaw          flag consulted by {@link SelectedHtml} to choose between
     *                       the page document and {@code html}; ignored by the
     *                       other implementations in this file
     * @param fieldExtractor supplies the selector to apply
     * @return whatever list the selector yields for the chosen input
     */
    List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor);

    /** Always selects from the page's HTML document; {@code html} and {@code isRaw} are unused. */
    class RawHtml implements Source {
        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return page.getHtml().selectDocument(fieldExtractor.getSelector());
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return page.getHtml().selectDocumentForList(fieldExtractor.getSelector());
        }
    }

    /**
     * Selects from the page's HTML document when {@code isRaw} is set, and from
     * the supplied {@code html} fragment otherwise.
     */
    class SelectedHtml implements Source {
        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return isRaw
                    ? page.getHtml().selectDocument(fieldExtractor.getSelector())
                    : fieldExtractor.getSelector().select(html);
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return isRaw
                    ? page.getHtml().selectDocumentForList(fieldExtractor.getSelector())
                    : fieldExtractor.getSelector().selectList(html);
        }
    }

    /** Applies the selector to the string form of the page URL. */
    class Url implements Source {
        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().select(page.getUrl().toString());
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().selectList(page.getUrl().toString());
        }
    }

    /** Applies the selector to {@code page.getRawText()}. */
    class RawText implements Source {
        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().select(page.getRawText());
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().selectList(page.getRawText());
        }
    }

    /** Applies the selector directly to the supplied {@code html}; {@code page} and {@code isRaw} are unused. */
    class DefaultSource implements Source {
        @Override
        public String getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().select(html);
        }

        @Override
        public List<String> getTextList(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
            return fieldExtractor.getSelector().selectList(html);
        }
    }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package us.codecraft.webmagic.model.sources;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.fields.MultipleField;
import us.codecraft.webmagic.model.fields.PageField;
import us.codecraft.webmagic.model.fields.SingleField;

/**
 * Bridges a {@link FieldExtractor}'s {@link Source} to the {@link PageField}
 * wrapper matching its multiplicity.
 */
public class SourceTextExtractor {

    /**
     * Runs the field extractor's source against the page and wraps the result.
     *
     * @param page           page being processed
     * @param html           HTML fragment forwarded unchanged to the source
     * @param isRaw          flag forwarded unchanged to the source
     * @param fieldExtractor provides the source and the multi flag
     * @return a {@link MultipleField} built from {@code getTextList} when the
     *         extractor is multi-valued, otherwise a {@link SingleField} built
     *         from {@code getText}
     */
    public static PageField getText(Page page, String html, boolean isRaw, FieldExtractor fieldExtractor) {
        Source extractionSource = fieldExtractor.getSource();
        if (!fieldExtractor.isMulti()) {
            return new SingleField(extractionSource.getText(page, html, isRaw, fieldExtractor));
        }
        return new MultipleField(extractionSource.getTextList(page, html, isRaw, fieldExtractor));
    }
}