List of usage examples for the org.jsoup.select.NodeTraversor constructor
public NodeTraversor(NodeVisitor visitor)
From source file:com.trackplus.track.util.html.Html2LaTeX.java
/** * Format an Element to LaTeX/*from w w w .j a v a2 s. c o m*/ * * @param element * the root element to format * @return formatted text */ public static String getLatexText(Element element) { FormattingVisitor formatter = new FormattingVisitor(); NodeTraversor traversor = new NodeTraversor(formatter); traversor.traverse(element); // walk the DOM, and call .head() and // .tail() for each node return formatter.toString(); }
From source file:com.twentyn.patentExtractor.PatentDocument.java
private static List<String> extractTextFromHTML(DocumentBuilder docBuilder, NodeList textNodes) throws ParserConfigurationException, TransformerConfigurationException, TransformerException, XPathExpressionException { List<String> allTextList = new ArrayList<>(0); if (textNodes != null) { for (int i = 0; i < textNodes.getLength(); i++) { Node n = textNodes.item(i); /* This extremely around-the-horn approach to handling text content is due to the mix of HTML and * XML in the patent body. We use Jsoup to parse the HTML entities we find in the body, and use * its extremely convenient NodeVisitor API to recursively traverse the document and extract the * text content in reasonable chunks. *///w ww . j ava 2s. c om Document contentsDoc = Util.nodeToDocument(docBuilder, "body", n); String docText = Util.documentToString(contentsDoc); // With help from http://stackoverflow.com/questions/832620/stripping-html-tags-in-java org.jsoup.nodes.Document htmlDoc = Jsoup.parse(docText); HtmlVisitor visitor = new HtmlVisitor(); NodeTraversor traversor = new NodeTraversor(visitor); traversor.traverse(htmlDoc); List<String> textSegments = visitor.getTextContent(); allTextList.addAll(textSegments); } } return allTextList; }
From source file:no.kantega.publishing.admin.content.htmlfilter.RemoveNestedSpanTagsFilter.java
private void copySafeNodes(Element source, Element dest) { CleaningVisitor cleaningVisitor = new CleaningVisitor(dest); NodeTraversor traversor = new NodeTraversor(cleaningVisitor); traversor.traverse(source);// w ww .j ava 2 s . c o m }
From source file:org.apache.jmeter.protocol.http.parser.JsoupBasedHtmlParser.java
@Override public Iterator<URL> getEmbeddedResourceURLs(String userAgent, byte[] html, URL baseUrl, URLCollection coll, String encoding) throws HTMLParseException { try {//from w w w . j a v a 2s .c om // TODO Handle conditional comments for IE String contents = new String(html, encoding); Document doc = Jsoup.parse(contents); JMeterNodeVisitor nodeVisitor = new JMeterNodeVisitor(new URLPointer(baseUrl), coll); new NodeTraversor(nodeVisitor).traverse(doc); return coll.iterator(); } catch (Exception e) { throw new HTMLParseException(e); } }
From source file:org.jsweet.input.typescriptdef.visitor.DocFiller.java
@Override public void visitTypeDeclaration(TypeDeclaration typeDeclaration) { if (typeDeclaration.getDocumentation() == null) { String moduleName = getCurrentModuleName(); if (JSweetDefTranslatorConfig.LANG_PACKAGE.equals(moduleName) || JSweetDefTranslatorConfig.DOM_PACKAGE.equals(moduleName)) { this.currentModule = moduleName; String content = getTypeContent(context.cacheDir, "mdn", moduleName, typeDeclaration.getName()); if (content != null) { try { Document doc = Jsoup.parse(content, "UTF-8"); NodeTraversor traversor; traversor = new NodeTraversor(new MdnTableFormatGrabber(this, typeDeclaration)); traversor.traverse(doc.body()); traversor = new NodeTraversor(new MdnDefinitionListFormatGrabber(this, typeDeclaration)); traversor.traverse(doc.body()); traversor = new NodeTraversor(new MdnMainDescriptionGrabber(this, typeDeclaration)); traversor.traverse(doc.body()); } catch (Throwable t) { context.reportError("cannot fill documentation for " + context.getTypeName(typeDeclaration), typeDeclaration.getToken(), t); }// w w w . j a v a 2 s. c om } } } }
From source file:org.jsweet.input.typescriptdef.visitor.DocFiller.java
/**
 * Parses {@code html} with Jsoup and traverses the document body with a {@code TagRemover}
 * built from a fresh buffer and {@code tagsToBeRemoved}.
 *
 * @param html markup to process
 * @param tagsToBeRemoved tag names handed to the {@code TagRemover}
 * @return the buffer contents after traversal (presumably written by the {@code TagRemover};
 *     confirm against its implementation), with every {@code "<p></p>"} occurrence removed
 */
public static String removeTags(String html, String[] tagsToBeRemoved) {
    StringBuilder output = new StringBuilder();
    TagRemover remover = new TagRemover(output, tagsToBeRemoved);
    new NodeTraversor(remover).traverse(Jsoup.parse(html).body());
    String rendered = output.toString();
    // Drop paragraph pairs left empty in the rendered output.
    return rendered.replace("<p></p>", "");
}