List of usage examples for org.jsoup.nodes.Node.toString()
public String toString()
From source file:com.bibisco.manager.TextEditorManager.java
private static void parseTextNode(HtmlParsingResult pHtmlParsingResult, Node pNode) { List<String> lListWords = new ArrayList<String>(); mLog.debug("Start parseTextNode(HtmlParsingResult, Node): ", pNode.toString()); // character count String lStrNodeText = StringUtils.replace(pNode.toString(), " ", " "); lStrNodeText = StringUtils.replace(lStrNodeText, "\n", ""); lStrNodeText = StringEscapeUtils.unescapeHtml(lStrNodeText); pHtmlParsingResult.characterCount += lStrNodeText.length(); // extract words lStrNodeText = pNode.toString();// w w w . j a v a 2 s.c o m lStrNodeText = StringUtils.replace(lStrNodeText, " ", ""); lStrNodeText = StringUtils.replace(lStrNodeText, "«", ""); lStrNodeText = StringUtils.replace(lStrNodeText, "»", ""); lStrNodeText = StringUtils.replace(lStrNodeText, "—", ""); lStrNodeText = StringEscapeUtils.unescapeHtml(lStrNodeText); lStrNodeText = replaceCharIntervalWithWhiteSpace(lStrNodeText, 33, 38); lStrNodeText = replaceCharIntervalWithWhiteSpace(lStrNodeText, 40, 47); lStrNodeText = replaceCharIntervalWithWhiteSpace(lStrNodeText, 58, 64); lStrNodeText = replaceCharIntervalWithWhiteSpace(lStrNodeText, 91, 96); lStrNodeText = replaceCharIntervalWithWhiteSpace(lStrNodeText, 123, 126); lStrNodeText = replaceCharIntervalWithWhiteSpace(lStrNodeText, 161, 191); lStrNodeText = StringUtils.replaceChars(lStrNodeText, '', ' '); lStrNodeText = StringUtils.replaceChars(lStrNodeText, '', ' '); lStrNodeText = StringUtils.replaceChars(lStrNodeText, '', ' '); lStrNodeText = lStrNodeText.trim(); if (StringUtils.isNotBlank(lStrNodeText)) { StringTokenizer lStringTokenizer = new StringTokenizer(lStrNodeText); while (lStringTokenizer.hasMoreTokens()) { lListWords.add(lStringTokenizer.nextToken()); } } pHtmlParsingResult.words.addAll(lListWords); mLog.debug("End parseTextNode(HtmlParsingResult, Node)"); }
From source file:org.coronastreet.gpxconverter.GarminForm.java
private static String findFlowKey(Node node) { String key = null;/*w w w . j a va 2 s. c om*/ for (int i = 0; i < node.childNodes().size();) { Node child = node.childNode(i); if (child.nodeName().equals("#comment")) { //System.out.println(child.toString()); String flowKeyPattern = "\\<\\!-- flowExecutionKey\\: \\[(e1s1)\\] --\\>"; key = child.toString().replaceAll(flowKeyPattern, "$1").trim(); break; } else { findFlowKey(child); i++; } } return key; }
From source file:com.screenslicer.common.CommonUtil.java
/**
 * Sets the output charset of {@code doc} and rewrites the text of every
 * non-empty text node with the result of {@code HtmlCoder.decode} applied to
 * the node's serialized form.
 *
 * <p>Failures on an individual node are logged and do not stop the traversal.
 *
 * @param doc   document to modify in place
 * @param ascii {@code true} to output as "ascii", {@code false} for "utf-8"
 * @return the same {@code doc} instance that was passed in
 */
private static Element sanitize(Document doc, final boolean ascii) {
    doc.outputSettings().charset(ascii ? "ascii" : "utf-8");

    doc.traverse(new NodeVisitor() {
        @Override
        public void head(Node visited, int depth) {
            try {
                if (!visited.nodeName().equals("#text")) {
                    return;
                }
                if (CommonUtil.isEmpty(visited.outerHtml())) {
                    return;
                }
                TextNode textNode = (TextNode) visited;
                textNode.text(HtmlCoder.decode(visited.toString()));
            } catch (Throwable failure) {
                // best effort: report and keep visiting the remaining nodes
                Log.exception(failure);
            }
        }

        @Override
        public void tail(Node visited, int depth) {
            // nothing to do when leaving a node
        }
    });

    return doc;
}
From source file:com.screenslicer.core.util.Util.java
/**
 * Tells whether a node should be treated as empty.
 *
 * <p>A node is empty when it is {@code null}; when it is a comment, data,
 * {@code style} or {@code script} node; when {@code isHidden} reports it as
 * hidden; when filtering is requested and {@code isFiltered} matches it; or
 * when it is a text node whose serialized form is empty according to
 * {@code CommonUtil.isEmpty(String, true)}.
 *
 * @param node     node to inspect, may be {@code null}
 * @param doFilter whether the {@code isFiltered} check is applied
 * @return {@code true} if any of the conditions above holds
 */
public static boolean isEmpty(Node node, boolean doFilter) {
    if (node == null) {
        return true;
    }

    final String name = node.nodeName();
    if (name.equals("#comment") || name.equals("#data")
            || name.equals("style") || name.equals("script")) {
        return true;
    }
    if (isHidden(node)) {
        return true;
    }
    if (doFilter && isFiltered(node)) {
        return true;
    }
    return name.equals("#text") && CommonUtil.isEmpty(node.toString(), true);
}
From source file:net.poemerchant.scraper.ShopScraper.java
public List<Buyout> scrapeItemBuyouts(int noOfItems) { buyouts = new ArrayList<Buyout>(noOfItems); for (int i = 0; i < noOfItems; i++) { Buyout buyout = Buyout.NONE;/* w w w . ja v a 2 s. com*/ Node itemElem = doc.select("#item-fragment-" + i).first(); Element itemElemNext = doc.select("#item-fragment-" + (i + 1)).first(); if (itemElem != null) { while (!itemElem.equals(itemElemNext)) { itemElem = itemElem.nextSibling(); if (itemElem == null) { // case where there is no b/o set and we've reached the end of a spoiler break; } if (Element.class.isAssignableFrom(itemElem.getClass())) continue; String boRaw = StringUtils.trim(itemElem.toString()); String[] split = StringUtils.split(boRaw); if (split.length == 3) { BuyoutMode buyoutMode = BuyoutMode.parse(split[0]); if (buyoutMode != BuyoutMode.unknown) { buyout = new Buyout(boRaw); break; } } } } else { logger.severe( "Actual item in the OP was not found. Buyout will be defaulted to NONE. Item index is " + i); } buyouts.add(buyout); } return buyouts; }
From source file:com.screenslicer.core.scrape.type.ComparableNode.java
/**
 * Builds the integer score vector ({@code scores}) describing {@code node}.
 *
 * <p>The vector is assembled from three sources: per-child tallies and
 * statistics gathered over the node's non-empty children, a
 * {@code NodeCounter} built over the full child list, and the number of
 * non-empty siblings of the node.
 *
 * <p>NOTE(review): {@code node.parent()} is dereferenced without a null
 * check, so a node for which {@code parent()} returns {@code null} would
 * fail here - confirm callers never pass such a node.
 *
 * @param node the node to describe; also stored in the {@code node} field
 */
public ComparableNode(final Node node) {
    this.node = node;
    List<Node> separated = node.childNodes();

    // Tallies over the non-empty direct children.
    int children = 0;
    int childBlocks = 0;
    int childFormatting = 0;
    int childContent = 0;
    int childItems = 0;
    int childDecoration = 0;
    int anchorChildren = 0;
    int textChildren = 0;
    int anchorTextChildren = 0;
    int anchorChildItems = 0;
    int textChildItems = 0;
    int anchorTextChildItems = 0;
    int itemChars = 0;
    int itemAnchorChars = 0;

    // Tag bookkeeping used to judge how alike the children are.
    List<String> firstChildTags = null;
    List<List<String>> orderedTags = new ArrayList<List<String>>();
    List<String> allChildTags = new ArrayList<String>();
    ArrayList<List<String>> childTags = new ArrayList<List<String>>();
    boolean childrenConsistent = true;
    String childName = null;
    boolean childrenSame = true;
    double avgChildLengthDouble = 0d;
    int nodeStrLen = Util.trimmedLen(node.toString());

    // One sample is added per non-empty child (item statistics: per item child only).
    DescriptiveStatistics statAnchorChars = new DescriptiveStatistics();
    DescriptiveStatistics statAnchors = new DescriptiveStatistics();
    DescriptiveStatistics statChars = new DescriptiveStatistics();
    DescriptiveStatistics statDescendants = new DescriptiveStatistics();
    DescriptiveStatistics statFields = new DescriptiveStatistics();
    DescriptiveStatistics statLevels = new DescriptiveStatistics();
    DescriptiveStatistics statLongestField = new DescriptiveStatistics();
    DescriptiveStatistics statNonAnchorChars = new DescriptiveStatistics();
    DescriptiveStatistics statTextAnchors = new DescriptiveStatistics();
    DescriptiveStatistics statStrLen = new DescriptiveStatistics();
    DescriptiveStatistics statItemChars = new DescriptiveStatistics();
    DescriptiveStatistics statItemAnchorChars = new DescriptiveStatistics();

    for (Node child : separated) {
        if (!Util.isEmpty(child)) {
            children++;
            int childStrLen = Util.trimmedLen(child.toString());
            // Accumulates the sum here; divided by the child count after the loop.
            avgChildLengthDouble += childStrLen;
            NodeCounter counter = new NodeCounter(child);
            if (Util.isItem(child.nodeName())) {
                ++childItems;
                anchorChildItems += counter.anchors() > 0 ? 1 : 0;
                textChildItems += counter.fields() > 0 ? 1 : 0;
                anchorTextChildItems += counter.anchors() > 0 && counter.fields() > 0 ? 1 : 0;
                itemChars += counter.chars();
                itemAnchorChars += counter.anchorChars();
                statItemChars.addValue(counter.chars());
                statItemAnchorChars.addValue(counter.anchorChars());
            }
            if (Util.isBlock(child.nodeName())) {
                ++childBlocks;
            }
            if (Util.isDecoration(child.nodeName())) {
                ++childDecoration;
            }
            if (Util.isFormatting(child.nodeName())) {
                ++childFormatting;
            }
            if (Util.isContent(child)) {
                ++childContent;
            }
            anchorChildren += counter.anchors() > 0 ? 1 : 0;
            textChildren += counter.fields() > 0 ? 1 : 0;
            anchorTextChildren += counter.anchors() > 0 && counter.fields() > 0 ? 1 : 0;
            statAnchorChars.addValue(counter.anchorChars());
            statAnchors.addValue(counter.anchors());
            statChars.addValue(counter.chars());
            statDescendants.addValue(counter.descendants());
            statFields.addValue(counter.fields());
            statLevels.addValue(counter.levels());
            statLongestField.addValue(counter.longestField());
            statNonAnchorChars.addValue(counter.nonAnchorChars());
            statTextAnchors.addValue(counter.textAnchors());
            statStrLen.addValue(childStrLen);
            List<String> curChildTags = counter.tags();
            allChildTags = Util.join(allChildTags, curChildTags);
            childTags.add(curChildTags);
            // The first non-empty child is the reference for the "consistent" check.
            if (firstChildTags == null) {
                firstChildTags = curChildTags;
            } else if (childrenConsistent && !Util.isSame(firstChildTags, curChildTags)) {
                childrenConsistent = false;
            }
            // Likewise, its node name is the reference for the "same" check.
            if (childName == null) {
                childName = child.nodeName();
            } else if (childrenSame && !childName.equals(child.nodeName())) {
                childrenSame = false;
            }
            // Record each ordered tag list only once.
            if (!Util.contains(counter.orderedTags(), orderedTags)) {
                orderedTags.add(counter.orderedTags());
            }
        }
    }

    // Turn the accumulated length sum into an average; 0 when there are no children.
    avgChildLengthDouble = children == 0 ? 0 : avgChildLengthDouble / (double) children;
    int avgChildLength = (int) avgChildLengthDouble;

    // For each child: size of the joined tag list minus the size of the child's own tag list.
    double avgChildDiff = 0;
    int maxChildDiff = 0;
    for (List<String> tagList : childTags) {
        avgChildDiff += allChildTags.size() - tagList.size();
        maxChildDiff = Math.max(maxChildDiff, allChildTags.size() - tagList.size());
    }
    avgChildDiff = childTags.size() == 0 ? 0 : avgChildDiff / (double) childTags.size();

    // Children only count as consistent when the reference child actually had tags.
    childrenConsistent = firstChildTags != null && !firstChildTags.isEmpty() && childrenConsistent;

    // Counter over the whole child list (the loop above used one counter per child).
    NodeCounter counter = new NodeCounter(separated);

    // Non-empty nodes among the parent's children; the loop does not exclude the node itself.
    int siblings = 0;
    for (Node sibling : node.parent().childNodes()) {
        if (!Util.isEmpty(sibling)) {
            siblings++;
        }
    }

    // NOTE(review): the entries are positional - presumably consumers index
    // into this array, so the order below must not change; verify before editing.
    this.scores = new int[] {
            // whole-node counts and their ratio to the number of children
            counter.items(), counter.blocks(), counter.decoration(), counter.formatting(), counter.content(),
            div(counter.items(), children), div(counter.blocks(), children), div(counter.decoration(), children),
            div(counter.formatting(), children), div(counter.content(), children),
            // direct-child tallies
            childItems, childBlocks, childDecoration, childFormatting, childContent,
            avgChildLength, counter.fields(), textChildItems, counter.images(), counter.anchors(),
            counter.textAnchors(),
            // character-based measures
            div(counter.chars(), Math.max(1, counter.fields())), div(itemChars, Math.max(1, textChildItems)),
            counter.longestField(), nodeStrLen, div(nodeStrLen, children), counter.anchorLen(), counter.chars(),
            itemChars, div(counter.chars(), children), div(itemChars, childItems),
            counter.nonAnchorChars(), div(counter.nonAnchorChars(), children),
            div(counter.nonAnchorChars(), childItems), div(counter.nonAnchorChars(), childBlocks),
            div(counter.nonAnchorChars(), childContent), div(counter.nonAnchorChars(), counter.anchors()),
            div(counter.nonAnchorChars(), counter.textAnchors()),
            counter.anchorChars(), itemAnchorChars, div(itemAnchorChars, anchorChildItems),
            div(counter.anchorChars(), counter.anchors()), div(counter.anchorChars(), counter.textAnchors()),
            div(counter.anchorChars(), children),
            // tree shape
            counter.descendants(), counter.levels(), div(counter.descendants(), children),
            div(children, counter.levels()), siblings, children, maxChildDiff, toInt(avgChildDiff),
            toInt(childrenSame), toInt(childrenConsistent), orderedTags.size(),
            // each count compared against RESULT_GROUP_LARGE / RESULT_GROUP_SMALL
            mod0(children, RESULT_GROUP_LARGE), mod0(children, RESULT_GROUP_SMALL),
            distance(children, RESULT_GROUP_LARGE), distance(children, RESULT_GROUP_SMALL),
            mod0(childItems, RESULT_GROUP_LARGE), mod0(childItems, RESULT_GROUP_SMALL),
            distance(childItems, RESULT_GROUP_LARGE), distance(childItems, RESULT_GROUP_SMALL),
            mod0(childBlocks, RESULT_GROUP_LARGE), mod0(childBlocks, RESULT_GROUP_SMALL),
            distance(childBlocks, RESULT_GROUP_LARGE), distance(childBlocks, RESULT_GROUP_SMALL),
            mod0(childContent, RESULT_GROUP_LARGE), mod0(childContent, RESULT_GROUP_SMALL),
            distance(childContent, RESULT_GROUP_LARGE), distance(childContent, RESULT_GROUP_SMALL),
            mod0(counter.anchors(), RESULT_GROUP_LARGE), mod0(counter.anchors(), RESULT_GROUP_SMALL),
            distance(counter.anchors(), RESULT_GROUP_LARGE), distance(counter.anchors(), RESULT_GROUP_SMALL),
            mod0(anchorChildItems, RESULT_GROUP_LARGE), mod0(anchorChildItems, RESULT_GROUP_SMALL),
            distance(anchorChildItems, RESULT_GROUP_LARGE), distance(anchorChildItems, RESULT_GROUP_SMALL),
            mod0(textChildItems, RESULT_GROUP_LARGE), mod0(textChildItems, RESULT_GROUP_SMALL),
            distance(textChildItems, RESULT_GROUP_LARGE), distance(textChildItems, RESULT_GROUP_SMALL),
            mod0(counter.textAnchors(), RESULT_GROUP_LARGE), mod0(counter.textAnchors(), RESULT_GROUP_SMALL),
            distance(counter.textAnchors(), RESULT_GROUP_LARGE),
            distance(counter.textAnchors(), RESULT_GROUP_SMALL),
            // child / item counts compared against anchor and text counts
            Math.abs(children - counter.anchors()), Math.abs(childItems - counter.anchors()),
            evenlyDivisible(children, counter.anchors()), evenlyDivisible(childItems, counter.anchors()),
            smallestMod(children, counter.anchors()), smallestMod(childItems, counter.anchors()),
            Math.abs(children - counter.textAnchors()), Math.abs(childItems - counter.textAnchors()),
            Math.abs(children - anchorChildren), Math.abs(childItems - anchorChildItems),
            Math.abs(children - textChildren), Math.abs(childItems - textChildItems),
            Math.abs(children - anchorTextChildren), Math.abs(childItems - anchorTextChildItems),
            evenlyDivisible(children, counter.textAnchors()), evenlyDivisible(childItems, counter.textAnchors()),
            evenlyDivisible(children, anchorChildren), evenlyDivisible(childItems, anchorChildItems),
            evenlyDivisible(children, textChildren), evenlyDivisible(childItems, textChildItems),
            evenlyDivisible(children, anchorTextChildren), evenlyDivisible(childItems, anchorTextChildItems),
            smallestMod(children, counter.textAnchors()), smallestMod(childItems, counter.textAnchors()),
            smallestMod(children, anchorChildren), smallestMod(childItems, anchorChildItems),
            smallestMod(children, textChildren), smallestMod(childItems, textChildItems),
            smallestMod(children, anchorTextChildren), smallestMod(childItems, anchorTextChildItems),
            Math.abs(anchorChildren - anchorChildItems), Math.abs(textChildren - textChildItems),
            Math.abs(anchorTextChildren - anchorTextChildItems),
            // skewness, standard deviation and mean of each per-child statistic
            toInt(statAnchorChars.getSkewness()), toInt(statAnchorChars.getStandardDeviation()),
            toInt(statAnchorChars.getMean()),
            toInt(statAnchors.getSkewness()), toInt(statAnchors.getStandardDeviation()),
            toInt(statAnchors.getMean()),
            toInt(statChars.getSkewness()), toInt(statChars.getStandardDeviation()),
            toInt(statChars.getMean()),
            toInt(statDescendants.getSkewness()), toInt(statDescendants.getStandardDeviation()),
            toInt(statDescendants.getMean()),
            toInt(statFields.getSkewness()), toInt(statFields.getStandardDeviation()),
            toInt(statFields.getMean()),
            toInt(statLevels.getSkewness()), toInt(statLevels.getStandardDeviation()),
            toInt(statLevels.getMean()),
            toInt(statLongestField.getSkewness()), toInt(statLongestField.getStandardDeviation()),
            toInt(statLongestField.getMean()),
            toInt(statNonAnchorChars.getSkewness()), toInt(statNonAnchorChars.getStandardDeviation()),
            toInt(statNonAnchorChars.getMean()),
            toInt(statStrLen.getSkewness()), toInt(statStrLen.getStandardDeviation()),
            toInt(statStrLen.getMean()),
            toInt(statTextAnchors.getSkewness()), toInt(statTextAnchors.getStandardDeviation()),
            toInt(statTextAnchors.getMean()),
            toInt(statItemChars.getSkewness()), toInt(statItemChars.getStandardDeviation()),
            toInt(statItemChars.getMean()),
            toInt(statItemAnchorChars.getSkewness()), toInt(statItemAnchorChars.getStandardDeviation()),
            toInt(statItemAnchorChars.getMean()), };
}
From source file:org.dswarm.xmlenhancer.XMLEnhancer.java
/**
 * Prints {@code node} to {@code out} with its entities unescaped.
 *
 * <p>Before printing, the output settings of the node's owner document are
 * switched to XHTML escape mode, XML syntax and no pretty printing. A text
 * node is printed via its whole text; any other node is serialized with
 * {@code toString()} and passed through {@code Parser.unescapeEntities}.
 *
 * @param out  writer receiving the text
 * @param node node to print
 */
private static void unescapeEntity(final PrintWriter out, final Node node) {
    node.ownerDocument().outputSettings()
            .escapeMode(Entities.EscapeMode.xhtml)
            .syntax(Document.OutputSettings.Syntax.xml)
            .prettyPrint(false);

    final String rendered;
    if (node instanceof TextNode) {
        rendered = ((TextNode) node).getWholeText();
    } else {
        rendered = Parser.unescapeEntities(node.toString(), true);
    }
    out.print(rendered);
}
From source file:org.dswarm.xmlenhancer.XMLEnhancer.java
private static void enhanceTextNode(final Node node) { final TextNode textNode = (TextNode) node; final String wholeText = textNode.getWholeText(); final String text = node.toString(); if (text.trim().isEmpty()) { return;/*from w ww. j av a 2 s .c o m*/ } if (wholeText.startsWith(START_CDATA)) { // do not add CDATA multiple times return; } final String alignedText = alignTextWithWholeText(wholeText, text); final String unescapeEntities = String.format("%s%s%s", START_CDATA, alignedText, END_CDATA); textNode.text(unescapeEntities); }
From source file:org.jsweet.input.typescriptdef.visitor.DocFiller.java
@Override public void head(Node node, int depth) { if (node instanceof TextNode) { String text = ((TextNode) node).text().trim(); if (members.containsKey(text)) { Node parent = node.parent(); if (parent != null && !"td".equals(parent.nodeName())) { parent = parent.parent(); }//w w w . j a va2 s . c o m if (parent != null && !"td".equals(parent.nodeName())) { parent = parent.parent(); } if (parent != null && "td".equals(parent.nodeName())) { List<Node> siblings = parent.parent().childNodes(); List<Node> tdSiblings = new ArrayList<Node>(); siblings.forEach(n -> { if ("td".equals(n.nodeName())) tdSiblings.add(n); }); if (tdSiblings.get(0) == parent && tdSiblings.size() == 3) { Node td = tdSiblings.get(2); String s = td.toString(); String doc = "/** " + DocFiller.removeTags(s.substring(4, s.length() - 5)) + " */"; for (Declaration d : members.get(text)) { d.setDocumentation(doc); } docFiller.countDoc(false); } } } } }