// Java tutorial
/**
 * ######################## SHENBAISE'S WORK ##########################
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.sbs.goodcrawler.extractor.selector;

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.sbs.goodcrawler.exception.ExtractException;
import org.sbs.goodcrawler.extractor.GCElement;
import org.sbs.goodcrawler.extractor.selector.action.SelectorAction;

/**
 * Base class for selectors that pull a value out of a jsoup {@link Document}
 * using a CSS selector expression.
 *
 * <p>The document to work on is not passed to the constructors; it has to be
 * supplied through {@link #setDocument(Document)} before content is requested.
 *
 * @author whiteme
 * @param <T> type of the content produced by the concrete selector
 * @date 20131010
 */
@SuppressWarnings("rawtypes")
public abstract class AbstractElementCssSelector<T> implements GCElement {

    /** Name of this selector. */
    protected String name;

    /** CSS selector expression. */
    protected String value;

    /**
     * Name of what should be extracted from the selected element, e.g. an
     * attribute name such as {@code src}. The special value {@code "tostring"}
     * makes the extract helpers return the element's {@code toString()} form.
     */
    protected String attr;

    /** Enum counterpart of {@link #attr}, resolved by name by the constructors. */
    protected SelectorAttr $Attr;

    /** Whether this selector is flagged as required. */
    protected boolean isRequired;

    /** Optional regular expression applied to the extracted string; {@code null} when unset. */
    protected Pattern pattern = null;

    /** Document this selector operates on. */
    protected Document document;

    /**
     * Set to {@code true} whenever a document is assigned through
     * {@link #setDocument(Document)}.
     * NOTE(review): presumably subclasses use this to decide whether a cached
     * result must be recomputed - confirm against the implementations.
     */
    protected boolean newDoc = true;

    /**
     * Position of the element to pick from a selection. {@code 0}, negative
     * values and values beyond the selection all mean "first element".
     */
    protected int index = 0;

    // Actions applied to the extracted element(s) are kept by subclasses:
    // protected List<SelectorAction> actions;

    /**
     * Creates an unconfigured selector; configure it through the setters.
     */
    public AbstractElementCssSelector() {
    }

    /**
     * Creates a fully configured selector.
     *
     * @param name       selector name
     * @param value      CSS selector expression
     * @param attr       what to extract; also resolved by name to a
     *                   {@link SelectorAttr}, falling back to
     *                   {@link SelectorAttr#other} when no constant matches
     * @param isRequired whether the selector is required
     * @param index      position of the element to pick from the selection
     * @param regex      optional regular expression applied to the extracted
     *                   string; ignored when blank
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is not blank and not a valid pattern
     */
    public AbstractElementCssSelector(String name, String value, String attr,
            boolean isRequired, int index, String regex) {
        super();
        this.name = name;
        this.value = value;
        this.attr = attr;
        this.$Attr = resolveSelectorAttr(attr);
        this.isRequired = isRequired;
        this.index = index;
        if (StringUtils.isNotBlank(regex)) {
            this.pattern = Pattern.compile(regex);
        }
    }

    /**
     * Creates a selector without a regular expression.
     *
     * @param name       selector name
     * @param value      CSS selector expression
     * @param attr       what to extract
     * @param isRequired whether the selector is required
     * @param index      position of the element to pick from the selection
     */
    public AbstractElementCssSelector(String name, String value, String attr,
            boolean isRequired, int index) {
        // Delegating keeps the SelectorAttr fallback identical across all constructors.
        this(name, value, attr, isRequired, index, null);
    }

    /**
     * Creates a selector without a regular expression that picks the first
     * selected element.
     *
     * @param name       selector name
     * @param value      CSS selector expression
     * @param attr       what to extract
     * @param isRequired whether the selector is required
     */
    public AbstractElementCssSelector(String name, String value, String attr,
            boolean isRequired) {
        this(name, value, attr, isRequired, 0, null);
    }

    /**
     * Returns the content produced by this selector.
     *
     * @return the extracted content
     * @throws ExtractException declared for implementations to signal extraction failures
     */
    public abstract T getContent() throws ExtractException;

    /**
     * Returns the content produced by this selector as a map.
     * NOTE(review): presumably keyed by the selector {@link #name} with the
     * extracted content as value - confirm against the implementations.
     *
     * @return map holding the extracted content
     * @throws ExtractException declared for implementations to signal extraction failures
     */
    public abstract Map<String, T> getContentMap() throws ExtractException;

    /**
     * Adds an action to this selector; handling is defined by the subclass.
     *
     * @param action the action to add
     */
    public abstract void addAction(SelectorAction action);

    public String getName() {
        return name;
    }

    public AbstractElementCssSelector setName(String name) {
        this.name = name;
        return this;
    }

    public int getIndex() {
        return index;
    }

    public void setIndex(int index) {
        this.index = index;
    }

    public String getValue() {
        return value;
    }

    public AbstractElementCssSelector setValue(String value) {
        this.value = value;
        return this;
    }

    public String getAttr() {
        return attr;
    }

    /**
     * Sets the textual attribute name only; the enum form is left untouched
     * and has to be updated separately through {@link #set$Attr(SelectorAttr)}.
     */
    public AbstractElementCssSelector setAttr(String attr) {
        this.attr = attr;
        return this;
    }

    public boolean isRequired() {
        return isRequired;
    }

    public AbstractElementCssSelector setRequired(boolean isRequired) {
        this.isRequired = isRequired;
        return this;
    }

    public Document getDocument() {
        return document;
    }

    /**
     * Assigns the document to work on and raises the {@link #newDoc} flag.
     */
    public AbstractElementCssSelector setDocument(Document document) {
        this.document = document;
        this.newDoc = true;
        return this;
    }

    public SelectorAttr get$Attr() {
        return $Attr;
    }

    public AbstractElementCssSelector set$Attr(SelectorAttr $Attr) {
        this.$Attr = $Attr;
        return this;
    }

    public AbstractElementCssSelector setNewDoc(boolean newDoc) {
        this.newDoc = newDoc;
        return this;
    }

    /**
     * Raises the {@link #newDoc} flag. Despite the name this is not a query:
     * it always sets the flag to {@code true}.
     */
    protected void isNewDoc() {
        this.newDoc = true;
    }

    /**
     * Extracts a string from the element at {@link #index} of the selection:
     * the element's {@code toString()} form when {@link #attr} is
     * {@code "tostring"}, otherwise its text. When a {@link #pattern} is set
     * and matches, the match replaces the result.
     *
     * @param elements the selection to read from
     * @return the extracted string, or {@code ""} when the selection is null or empty
     */
    protected String getExtractText(Elements elements) {
        if (elements == null || elements.isEmpty()) {
            return "";
        }
        int position = effectiveIndex(elements.size());
        // Constant-first comparison: attr is null after the no-arg constructor.
        String extracted = "tostring".equals(attr)
                ? elements.get(position).toString()
                : elements.get(position).text();
        return applyPattern(extracted);
    }

    /**
     * Extracts a string from the element at {@link #index} of the selection:
     * the element's {@code toString()} form when {@code attr} is
     * {@code "tostring"}, otherwise the value of the attribute named
     * {@code attr}. When a {@link #pattern} is set and matches, the match
     * replaces the result.
     *
     * @param elements the selection to read from
     * @param attr     attribute name to read (this parameter, not the field, is used)
     * @return the extracted string, or {@code ""} when the selection is null or empty
     */
    protected String getExtractAttr(Elements elements, String attr) {
        // An empty selection has no first element to read from.
        if (elements == null || elements.isEmpty()) {
            return "";
        }
        int position = effectiveIndex(elements.size());
        String extracted = "tostring".equals(attr)
                ? elements.get(position).toString()
                : elements.get(position).attr(attr);
        return applyPattern(extracted);
    }

    /** Resolves an attribute name to its enum constant, defaulting to {@code other}. */
    private static SelectorAttr resolveSelectorAttr(String attrName) {
        SelectorAttr resolved =
                org.apache.commons.lang3.EnumUtils.getEnum(SelectorAttr.class, attrName);
        return resolved != null ? resolved : SelectorAttr.other;
    }

    /** Returns {@link #index} when it addresses an element of a selection of the given size, else 0. */
    private int effectiveIndex(int size) {
        // index == size is already out of range, hence the strict upper bound.
        return (index > 0 && index < size) ? index : 0;
    }

    /** Applies {@link #pattern} to the text; returns the text unchanged when unset or not matching. */
    private String applyPattern(String text) {
        if (pattern == null || text == null) {
            return text;
        }
        Matcher matcher = pattern.matcher(text);
        if (!matcher.find()) {
            return text;
        }
        // Prefer the first capture group; fall back to the whole match when
        // the pattern declares no group, instead of failing on group(1).
        return matcher.groupCount() >= 1 ? matcher.group(1) : matcher.group();
    }
}