Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions webmagic-extension/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
<artifactId>webmagic-extension</artifactId>

<dependencies>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.32</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>redis.clients</groupId>
<artifactId>jedis</artifactId>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,23 +1,27 @@
package us.codecraft.webmagic.model;

import lombok.Getter;
import lombok.Setter;
import us.codecraft.webmagic.selector.Selector;

/**
* The object contains 'ExtractBy' information.
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
class Extractor {
public class Extractor {

@Getter @Setter
protected Selector selector;

@Getter
protected final Source source;

protected final boolean notNull;

protected final boolean multi;

static enum Source {Html, Url, RawHtml, RawText}
public static enum Source {Html, Url, RawHtml, RawText}

public Extractor(Selector selector, Source source, boolean notNull, boolean multi) {
this.selector = selector;
Expand All @@ -26,23 +30,11 @@ public Extractor(Selector selector, Source source, boolean notNull, boolean mult
this.multi = multi;
}

Selector getSelector() {
return selector;
}

Source getSource() {
return source;
}

boolean isNotNull() {
public boolean isNotNull() {
return notNull;
}

boolean isMulti() {
public boolean isMulti() {
return multi;
}

void setSelector(Selector selector) {
this.selector = selector;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,53 +6,27 @@
import java.lang.reflect.Field;
import java.lang.reflect.Method;

import lombok.Getter;
import lombok.Setter;

/**
* Wrapper of field and extractor.
* @author code4crafter@gmail.com <br>
* @since 0.2.0
*/
class FieldExtractor extends Extractor {
public class FieldExtractor extends Extractor {

@Getter
private final Field field;

@Getter @Setter
private Method setterMethod;

@Getter @Setter
private ObjectFormatter objectFormatter;

public FieldExtractor(Field field, Selector selector, Source source, boolean notNull, boolean multi) {
super(selector, source, notNull, multi);
this.field = field;
}

Field getField() {
return field;
}

Selector getSelector() {
return selector;
}

Source getSource() {
return source;
}

void setSetterMethod(Method setterMethod) {
this.setterMethod = setterMethod;
}

Method getSetterMethod() {
return setterMethod;
}

boolean isNotNull() {
return notNull;
}

ObjectFormatter getObjectFormatter() {
return objectFormatter;
}

void setObjectFormatter(ObjectFormatter objectFormatter) {
this.objectFormatter = objectFormatter;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,21 @@
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import lombok.Getter;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.model.annotation.*;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;
import us.codecraft.webmagic.model.fields.PageField;
import us.codecraft.webmagic.model.formatter.ObjectFormatterBuilder;
import us.codecraft.webmagic.model.selections.MultipleSelection;
import us.codecraft.webmagic.model.selections.Selection;
import us.codecraft.webmagic.model.selections.SingleSelection;
import us.codecraft.webmagic.selector.*;
import us.codecraft.webmagic.utils.ClassUtils;
import us.codecraft.webmagic.utils.ExtractorUtils;

import java.lang.annotation.Annotation;
import java.lang.reflect.Field;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.List;
Expand All @@ -29,14 +33,19 @@
*/
class PageModelExtractor {

@Getter
private List<Pattern> targetUrlPatterns = new ArrayList<Pattern>();

@Getter
private Selector targetUrlRegionSelector;

@Getter
private List<Pattern> helpUrlPatterns = new ArrayList<Pattern>();

@Getter
private Selector helpUrlRegionSelector;

@Getter
private Class clazz;

private List<FieldExtractor> fieldExtractors;
Expand Down Expand Up @@ -233,145 +242,16 @@ private Object processSingle(Page page, String html, boolean isRaw) {
try {
o = clazz.newInstance();
for (FieldExtractor fieldExtractor : fieldExtractors) {
if (fieldExtractor.isMulti()) {
List<String> value=getMultiValueFromSource(page, fieldExtractor, html, isRaw);
if ((value == null || value.size() == 0) && fieldExtractor.isNotNull()) {
return null;
}
if (fieldExtractor.getObjectFormatter() != null) {
List<Object> converted = convertMultiValue(value, fieldExtractor.getObjectFormatter());
setField(o, fieldExtractor, converted);
} else {
setField(o, fieldExtractor, value);
}
} else {
String value=getSingleValueFromSource(page, fieldExtractor, html, isRaw);
if (value == null && fieldExtractor.isNotNull()) {
return null;
}
if (fieldExtractor.getObjectFormatter() != null) {
Object converted = convertSingleValue(value, fieldExtractor.getObjectFormatter());
if (converted == null && fieldExtractor.isNotNull()) {
return null;
}
setField(o, fieldExtractor, converted);
} else {
setField(o, fieldExtractor, value);
}
}
Selection selection = fieldExtractor.isMulti() ? new MultipleSelection() : new SingleSelection();
PageField field = selection.extractField(page, html, isRaw, fieldExtractor);
if (!field.operation(o, fieldExtractor, logger))
return null;
}
if (AfterExtractor.class.isAssignableFrom(clazz)) {
if (AfterExtractor.class.isAssignableFrom(clazz))
((AfterExtractor) o).afterProcess(page);
}
} catch (InstantiationException e) {
logger.error("extract fail", e);
} catch (IllegalAccessException e) {
logger.error("extract fail", e);
} catch (InvocationTargetException e) {
} catch (Exception e) {
logger.error("extract fail", e);
}
return o;
}

/**
 * Selects all matching values for a multi-valued field from the source named by the extractor.
 *
 * @param page           page being extracted; supplies the document, URL and raw text
 * @param fieldExtractor supplies the selector and the source kind
 * @param html           text the selector is applied to for the non-raw Html case and the default case
 * @param isRaw          when true, an Html source is selected through {@code page.getHtml()} instead of {@code html}
 * @return the list produced by the selector for the chosen source
 */
private List<String> getMultiValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) {
    final Selector selector = fieldExtractor.getSelector();
    switch (fieldExtractor.getSource()) {
        case RawHtml:
            return page.getHtml().selectDocumentForList(selector);
        case Html:
            return isRaw
                    ? page.getHtml().selectDocumentForList(selector)
                    : selector.selectList(html);
        case Url:
            return selector.selectList(page.getUrl().toString());
        case RawText:
            return selector.selectList(page.getRawText());
        default:
            return selector.selectList(html);
    }
}

/**
 * Selects a single value for a field from the source named by the extractor.
 *
 * @param page           page being extracted; supplies the document, URL and raw text
 * @param fieldExtractor supplies the selector and the source kind
 * @param html           text the selector is applied to for the non-raw Html case and the default case
 * @param isRaw          when true, an Html source is selected through {@code page.getHtml()} instead of {@code html}
 * @return the value produced by the selector for the chosen source
 */
private String getSingleValueFromSource(Page page, FieldExtractor fieldExtractor, String html, boolean isRaw) {
    final Selector selector = fieldExtractor.getSelector();
    switch (fieldExtractor.getSource()) {
        case RawHtml:
            return page.getHtml().selectDocument(selector);
        case Html:
            return isRaw
                    ? page.getHtml().selectDocument(selector)
                    : selector.select(html);
        case Url:
            return selector.select(page.getUrl().toString());
        case RawText:
            return selector.select(page.getRawText());
        default:
            return selector.select(html);
    }
}

/**
 * Runs one extracted string through the formatter.
 *
 * @param value           the extracted text
 * @param objectFormatter formatter applied to {@code value}
 * @return the formatted object, or {@code null} when an exception is thrown during formatting
 *         (the failure is logged, not rethrown)
 */
private Object convertSingleValue(String value, ObjectFormatter objectFormatter) {
    try {
        final Object formatted = objectFormatter.format(value);
        logger.debug("String {} is converted to {}", value, formatted);
        return formatted;
    } catch (Exception e) {
        logger.error("convert " + value + " to " + objectFormatter.clazz() + " error!", e);
        return null;
    }
}

/**
 * Formats every extracted string and collects the results.
 *
 * @param values          extracted strings, converted one by one in iteration order
 * @param objectFormatter formatter applied to each element
 * @return a new list holding the non-null conversion results; elements whose
 *         conversion yields {@code null} are left out
 */
private List<Object> convertMultiValue(List<String> values, ObjectFormatter objectFormatter) {
    final List<Object> results = new ArrayList<Object>();
    for (final String raw : values) {
        final Object formatted = convertSingleValue(raw, objectFormatter);
        if (formatted == null) {
            // a null result means the conversion failed or produced nothing; skip it
            continue;
        }
        results.add(formatted);
    }
    return results;
}

/**
 * Assigns an extracted value to the target object; a {@code null} value is ignored.
 *
 * @param o              object receiving the value
 * @param fieldExtractor supplies the optional setter method and the reflected field
 * @param value          value to assign
 * @throws IllegalAccessException    if the setter or the field cannot be accessed
 * @throws InvocationTargetException if the setter method itself throws
 */
private void setField(Object o, FieldExtractor fieldExtractor, Object value) throws IllegalAccessException, InvocationTargetException {
    if (value != null) {
        final Method setter = fieldExtractor.getSetterMethod();
        if (setter != null) {
            setter.invoke(o, value);
        }
        // the field is written directly as well, whether or not a setter was invoked
        fieldExtractor.getField().set(o, value);
    }
}

/**
 * @return the value of the {@code clazz} field
 */
Class getClazz() {
return clazz;
}

/**
 * @return the {@code targetUrlPatterns} list itself (not a copy)
 */
List<Pattern> getTargetUrlPatterns() {
return targetUrlPatterns;
}

/**
 * @return the {@code helpUrlPatterns} list itself (not a copy)
 */
List<Pattern> getHelpUrlPatterns() {
return helpUrlPatterns;
}

/**
 * @return the value of the {@code targetUrlRegionSelector} field
 */
Selector getTargetUrlRegionSelector() {
return targetUrlRegionSelector;
}

/**
 * @return the value of the {@code helpUrlRegionSelector} field
 */
Selector getHelpUrlRegionSelector() {
return helpUrlRegionSelector;
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
package us.codecraft.webmagic.model.fields;

import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;

import lombok.Getter;
import us.codecraft.webmagic.model.FieldExtractor;
import us.codecraft.webmagic.model.formatter.ObjectFormatter;

public class MultipleField extends PageField {
@Getter
private List<String> fieldNames;

public MultipleField(List<String> fieldNames) {
this.fieldNames = fieldNames;
}

public boolean operation(Object o, FieldExtractor fieldExtractor, Logger logger) throws IllegalAccessException, InvocationTargetException {
if ((this.fieldNames == null || this.fieldNames.size() == 0) && fieldExtractor.isNotNull())
return false;
if (fieldExtractor.getObjectFormatter() != null) {
List<Object> converted = this.convert(this.fieldNames, fieldExtractor.getObjectFormatter(), logger);
setField(o, fieldExtractor, converted);
}
else
setField(o, fieldExtractor, this.fieldNames);
return true;
}

private List<Object> convert(List<String> values, ObjectFormatter objectFormatter, Logger logger) {
List<Object> objects = new ArrayList<>();
for (String value : values) {
Object converted = this.convert(value, objectFormatter, logger);
if (converted != null)
objects.add(converted);
}
return objects;
}
}
Loading