Java tutorial
/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.aliuge.crawler.extractor.selector; import java.text.DecimalFormat; import java.text.NumberFormat; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.aliuge.crawler.exception.ExtractException; import org.aliuge.crawler.extractor.selector.action.IntegerSelectorAction; import org.aliuge.crawler.extractor.selector.action.SelectorAction; import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.math.NumberUtils; import org.jsoup.select.Elements; import com.google.common.collect.Lists; /** * @author chenxinwen * @date 201481 * @desc */ public class NumericaElementCssSelector extends AbstractElementCssSelector<Number> { NumberFormat format; Number content; private List<IntegerSelectorAction> actions = Lists.newArrayList(); public NumericaElementCssSelector() { super(); }; public NumericaElementCssSelector(String name, String value, String attr, boolean isRequired, int index, String regex) { super(name, value, attr, isRequired, index, regex); } public NumericaElementCssSelector(String name, String value, String attr, boolean isRequired, String parttern) { super(name, value, attr, isRequired, 0); format = new DecimalFormat(parttern); } @Override public Number getContent() throws ExtractException { try { // content???document2+?? if (null != content && !newDoc) { return content; } if (null != document) { Elements elements = super.document.select(value); if (elements.isEmpty()) return null; String temp; switch ($Attr) { case text: //temp = CharMatcher.DIGIT.retainFrom(getExtractText(elements)); temp = filterDIGITFromString(getExtractText(elements)); break; default: temp = filterDIGITFromString(getExtractAttr(elements, attr)); break; } if (StringUtils.isNotBlank(temp)) { content = NumberUtils.createNumber(temp); newDoc = false; return content; } } } catch (Exception e) { e.printStackTrace(); throw new ExtractException("????:" + e.getMessage()); } return null; } @Override public Map<String, Number> getContentMap() throws ExtractException { if (newDoc) { this.content = getContent(); } if (null == this.content) return null; Map<String, Number> m = new HashMap<String, Number>(1); m.put(name, this.content); return m; } private String filterDIGITFromString(String text) { Pattern pp = Pattern.compile("\\d+\\.?\\d*"); Matcher mm = pp.matcher(text); if (mm.find()) { String temp = mm.group(); if (temp.endsWith(".")) { temp = temp.substring(0, temp.length() - 1); } return temp; } return null; } /** * * @return */ public String getContentString() throws ExtractException { if (null == content && newDoc) { getContent(); } return format.format(this.content); } public NumberFormat getFormat() { return format; } public void setFormat(NumberFormat format) { this.format = format; } public static void main(String[] args) { String str = "\"8\""; System.out.println(new NumericaElementCssSelector().filterDIGITFromString(str)); } @SuppressWarnings("unchecked") @Override public void addAction(SelectorAction action) { this.actions.add((IntegerSelectorAction) action); } }