org.sbs.goodcrawler.extractor.selector.AbstractElementCssSelector.java Source code

Java tutorial

Introduction

Here is the source code for org.sbs.goodcrawler.extractor.selector.AbstractElementCssSelector.java

Source

/**
 * ########################  SHENBAISE'S WORK  ##########################
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sbs.goodcrawler.extractor.selector;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.sbs.goodcrawler.exception.ExtractException;
import org.sbs.goodcrawler.extractor.GCElement;
import org.sbs.goodcrawler.extractor.selector.action.SelectorAction;

/**
 * Base class for selectors that pull content out of a jsoup {@link Document}
 * using a CSS selector expression.
 * <p>
 * The document to work on is supplied through {@link #setDocument(Document)};
 * concrete subclasses decide how the selected elements are turned into a
 * value of type {@code T}.
 *
 * @author whiteme
 * @param <T> type of the content produced by {@link #getContent()}
 * @date 20131010
 */
@SuppressWarnings("rawtypes")
public abstract class AbstractElementCssSelector<T> implements GCElement {
    /**
     * Name of this selector.
     */
    protected String name;
    /**
     * CSS selector expression.
     */
    protected String value;
    /**
     * What to extract from the selected element, e.g. an attribute name such
     * as {@code src}, or {@code text}. The special value {@code "tostring"}
     * makes the extract helpers return the element's {@code toString()}.
     */
    protected String attr;
    /**
     * Enum form of {@link #attr}, resolved by the constructors. It is
     * {@code null} after the no-arg constructor and is NOT refreshed by
     * {@link #setAttr(String)}; use {@link #set$Attr(SelectorAttr)} for that.
     */
    protected SelectorAttr $Attr;
    /**
     * Whether this selector is flagged as required.
     * NOTE(review): how "required" is enforced is up to code outside this class.
     */
    protected boolean isRequired;

    /**
     * Optional regular expression applied to the extracted string; when it
     * matches, the extract helpers return the matched group instead of the
     * full string. {@code null} means no post-processing.
     */
    protected Pattern pattern = null;
    /**
     * Document this selector operates on.
     */
    protected Document document;
    /**
     * Set to {@code true} whenever a document is assigned through
     * {@link #setDocument(Document)}.
     * NOTE(review): presumably lets subclasses detect that previously
     * computed results are stale - confirm against subclasses.
     */
    protected boolean newDoc = true;

    /**
     * Position of the element to use among the selected elements.
     * {@code 0} (the default) and any out-of-range value select the first one.
     */
    protected int index = 0;

    // Disabled field kept from the original source:
    //   protected List<SelectorAction> actions;

    /**
     * Creates an unconfigured selector; use the setters to configure it.
     */
    public AbstractElementCssSelector() {
    }

    /**
     * Creates a fully configured selector.
     *
     * @param name       name of this selector
     * @param value      CSS selector expression
     * @param attr       what to extract (attribute name, or a
     *                   {@link SelectorAttr} constant name)
     * @param isRequired whether this selector is required
     * @param index      position of the element to use among the matches
     * @param regex      optional regular expression applied to the extracted
     *                   string; blank or {@code null} disables it
     */
    public AbstractElementCssSelector(String name, String value, String attr, boolean isRequired, int index,
            String regex) {
        super();
        this.name = name;
        this.value = value;
        this.attr = attr;
        this.$Attr = org.apache.commons.lang3.EnumUtils.getEnum(SelectorAttr.class, this.attr);
        if (this.$Attr == null) {
            // attr is not one of the predefined constants (e.g. a plain attribute name)
            this.$Attr = SelectorAttr.other;
        }
        this.isRequired = isRequired;
        this.index = index;
        if (StringUtils.isNotBlank(regex)) {
            this.pattern = Pattern.compile(regex);
        }
    }

    /**
     * Creates a selector without a post-processing regular expression.
     *
     * @param name       name of this selector
     * @param value      CSS selector expression
     * @param attr       what to extract
     * @param isRequired whether this selector is required
     * @param index      position of the element to use among the matches
     */
    public AbstractElementCssSelector(String name, String value, String attr, boolean isRequired, int index) {
        // Delegate so that $Attr gets the same SelectorAttr.other fallback.
        this(name, value, attr, isRequired, index, null);
    }

    /**
     * Creates a selector that uses the first matched element and no
     * post-processing regular expression.
     *
     * @param name       name of this selector
     * @param value      CSS selector expression
     * @param attr       what to extract
     * @param isRequired whether this selector is required
     */
    public AbstractElementCssSelector(String name, String value, String attr, boolean isRequired) {
        this(name, value, attr, isRequired, 0, null);
    }

    /**
     * Extracts the content selected by this selector.
     *
     * @return the extracted content
     * @throws ExtractException if extraction fails
     */
    public abstract T getContent() throws ExtractException;

    /**
     * Extracts the content as a map.
     * NOTE(review): presumably keyed by the selector name - confirm in subclasses.
     *
     * @return map of extracted content
     * @throws ExtractException if extraction fails
     */
    public abstract Map<String, T> getContentMap() throws ExtractException;

    /**
     * Registers an action with this selector.
     *
     * @param action the action to add
     */
    public abstract void addAction(SelectorAction action);

    /** @return the name of this selector */
    public String getName() {
        return name;
    }

    /**
     * @param name the new selector name
     * @return this selector, for chaining
     */
    public AbstractElementCssSelector setName(String name) {
        this.name = name;
        return this;
    }

    /** @return the position of the element used among the matches */
    public int getIndex() {
        return index;
    }

    /** @param index the position of the element to use among the matches */
    public void setIndex(int index) {
        this.index = index;
    }

    /** @return the CSS selector expression */
    public String getValue() {
        return value;
    }

    /**
     * @param value the new CSS selector expression
     * @return this selector, for chaining
     */
    public AbstractElementCssSelector setValue(String value) {
        this.value = value;
        return this;
    }

    /** @return the raw attribute specification */
    public String getAttr() {
        return attr;
    }

    /**
     * Sets the raw attribute specification. The enum form returned by
     * {@link #get$Attr()} is left untouched.
     *
     * @param attr the new attribute specification
     * @return this selector, for chaining
     */
    public AbstractElementCssSelector setAttr(String attr) {
        this.attr = attr;
        return this;
    }

    /** @return whether this selector is flagged as required */
    public boolean isRequired() {
        return isRequired;
    }

    /**
     * @param isRequired whether this selector is required
     * @return this selector, for chaining
     */
    public AbstractElementCssSelector setRequired(boolean isRequired) {
        this.isRequired = isRequired;
        return this;
    }

    /** @return the document this selector operates on */
    public Document getDocument() {
        return document;
    }

    /**
     * Assigns the document to operate on and raises the new-document flag.
     *
     * @param document the document to extract from
     * @return this selector, for chaining
     */
    public AbstractElementCssSelector setDocument(Document document) {
        this.document = document;
        this.newDoc = true;
        return this;
    }

    /** @return the enum form of the attribute specification, possibly {@code null} */
    public SelectorAttr get$Attr() {
        return $Attr;
    }

    /**
     * @param $Attr the enum form of the attribute specification
     * @return this selector, for chaining
     */
    public AbstractElementCssSelector set$Attr(SelectorAttr $Attr) {
        this.$Attr = $Attr;
        return this;
    }

    /**
     * @param newDoc the new value of the new-document flag
     * @return this selector, for chaining
     */
    public AbstractElementCssSelector setNewDoc(boolean newDoc) {
        this.newDoc = newDoc;
        return this;
    }

    /**
     * Raises the new-document flag. Despite its name this method is not a
     * predicate: it returns nothing and only sets {@link #newDoc} to {@code true}.
     */
    protected void isNewDoc() {
        this.newDoc = true;
    }

    /**
     * Extracts a string from the configured element of {@code elements}:
     * the element's {@code toString()} when {@link #attr} is
     * {@code "tostring"}, otherwise its text. The optional {@link #pattern}
     * is applied to the result.
     *
     * @param elements the selected elements
     * @return the extracted string, or {@code ""} when nothing was selected
     */
    protected String getExtractText(Elements elements) {
        if (elements == null || elements.isEmpty()) {
            return "";
        }
        int position = effectiveIndex(elements.size());
        // Constant-first comparison: the attr field is null after the no-arg constructor.
        String extracted = "tostring".equals(attr)
                ? elements.get(position).toString()
                : elements.get(position).text();
        return applyPattern(extracted);
    }

    /**
     * Extracts a string from the configured element of {@code elements}:
     * the element's {@code toString()} when {@code attr} is
     * {@code "tostring"}, otherwise the value of the attribute named
     * {@code attr}. The optional {@link #pattern} is applied to the result.
     *
     * @param elements the selected elements
     * @param attr     attribute name to read, or {@code "tostring"}
     * @return the extracted string, or {@code ""} when nothing was selected
     */
    protected String getExtractAttr(Elements elements, String attr) {
        // Same empty-selection guard as getExtractText; first()/get(0) is not usable here.
        if (elements == null || elements.isEmpty()) {
            return "";
        }
        int position = effectiveIndex(elements.size());
        String extracted = "tostring".equals(attr)
                ? elements.get(position).toString()
                : elements.get(position).attr(attr);
        return applyPattern(extracted);
    }

    /**
     * Maps the configured {@link #index} to a valid position in a non-empty
     * selection of the given size; anything outside {@code [0, size)} falls
     * back to the first element.
     */
    private int effectiveIndex(int size) {
        return (index > 0 && index < size) ? index : 0;
    }

    /**
     * Applies the optional {@link #pattern} to {@code text}. On a match the
     * first capturing group is returned, or the whole match when the pattern
     * defines no group; without a pattern or a match {@code text} is returned
     * unchanged.
     */
    private String applyPattern(String text) {
        if (pattern == null || text == null) {
            return text;
        }
        Matcher matcher = pattern.matcher(text);
        if (!matcher.find()) {
            return text;
        }
        // group(1) throws when the pattern has no capturing group.
        return matcher.groupCount() >= 1 ? matcher.group(1) : matcher.group();
    }

}