List of usage examples for the org.jsoup.select.NodeTraversor.traverse method
public void traverse(Node root)
From source file:com.trackplus.track.util.html.Html2LaTeX.java
/** * Format an Element to LaTeX/*from w ww . j a va 2 s . com*/ * * @param element * the root element to format * @return formatted text */ public static String getLatexText(Element element) { FormattingVisitor formatter = new FormattingVisitor(); NodeTraversor traversor = new NodeTraversor(formatter); traversor.traverse(element); // walk the DOM, and call .head() and // .tail() for each node return formatter.toString(); }
From source file:com.twentyn.patentExtractor.PatentDocument.java
private static List<String> extractTextFromHTML(DocumentBuilder docBuilder, NodeList textNodes) throws ParserConfigurationException, TransformerConfigurationException, TransformerException, XPathExpressionException { List<String> allTextList = new ArrayList<>(0); if (textNodes != null) { for (int i = 0; i < textNodes.getLength(); i++) { Node n = textNodes.item(i); /* This extremely around-the-horn approach to handling text content is due to the mix of HTML and * XML in the patent body. We use Jsoup to parse the HTML entities we find in the body, and use * its extremely convenient NodeVisitor API to recursively traverse the document and extract the * text content in reasonable chunks. *///from w ww . j a va 2s.c om Document contentsDoc = Util.nodeToDocument(docBuilder, "body", n); String docText = Util.documentToString(contentsDoc); // With help from http://stackoverflow.com/questions/832620/stripping-html-tags-in-java org.jsoup.nodes.Document htmlDoc = Jsoup.parse(docText); HtmlVisitor visitor = new HtmlVisitor(); NodeTraversor traversor = new NodeTraversor(visitor); traversor.traverse(htmlDoc); List<String> textSegments = visitor.getTextContent(); allTextList.addAll(textSegments); } } return allTextList; }
From source file:no.kantega.publishing.admin.content.htmlfilter.RemoveNestedSpanTagsFilter.java
/**
 * Traverses {@code source} with a {@code CleaningVisitor} constructed around {@code dest}.
 *
 * @param source the element whose subtree is traversed
 * @param dest the element handed to the {@code CleaningVisitor}
 */
private void copySafeNodes(Element source, Element dest) {
    new NodeTraversor(new CleaningVisitor(dest)).traverse(source);
}
From source file:org.jsweet.input.typescriptdef.visitor.DocFiller.java
/**
 * Fills in documentation for a type declaration that has none.
 *
 * <p>Only types whose current module is {@code JSweetDefTranslatorConfig.LANG_PACKAGE} or
 * {@code JSweetDefTranslatorConfig.DOM_PACKAGE} are handled. The "mdn" content for the type is
 * looked up via {@code getTypeContent}; when present it is parsed as HTML and its body is
 * traversed three times, once per grabber (table format, definition-list format, main
 * description), in that order. Any failure while parsing or traversing is reported through
 * {@code context.reportError} rather than propagated.
 *
 * @param typeDeclaration the type declaration to document
 */
@Override
public void visitTypeDeclaration(TypeDeclaration typeDeclaration) {
    if (typeDeclaration.getDocumentation() != null) {
        return;
    }

    String moduleName = getCurrentModuleName();
    boolean supportedModule = JSweetDefTranslatorConfig.LANG_PACKAGE.equals(moduleName)
            || JSweetDefTranslatorConfig.DOM_PACKAGE.equals(moduleName);
    if (!supportedModule) {
        return;
    }

    this.currentModule = moduleName;
    String content = getTypeContent(context.cacheDir, "mdn", moduleName, typeDeclaration.getName());
    if (content == null) {
        return;
    }

    try {
        // Jsoup.parse(String, String) takes a base URI as its second argument, not a charset
        // name. The content is already a decoded String, so the single-argument overload is
        // the correct call here.
        Document doc = Jsoup.parse(content);

        // Each grabber is created only after the previous traversal has finished, preserving
        // the original construction order.
        NodeTraversor traversor;
        traversor = new NodeTraversor(new MdnTableFormatGrabber(this, typeDeclaration));
        traversor.traverse(doc.body());
        traversor = new NodeTraversor(new MdnDefinitionListFormatGrabber(this, typeDeclaration));
        traversor.traverse(doc.body());
        traversor = new NodeTraversor(new MdnMainDescriptionGrabber(this, typeDeclaration));
        traversor.traverse(doc.body());
    } catch (Throwable t) {
        // Deliberately broad: a documentation failure is reported, not allowed to abort the visit.
        context.reportError("cannot fill documentation for " + context.getTypeName(typeDeclaration),
                typeDeclaration.getToken(), t);
    }
}
From source file:org.jsweet.input.typescriptdef.visitor.DocFiller.java
/**
 * Parses {@code html}, traverses its body with a {@code TagRemover} configured with
 * {@code tagsToBeRemoved}, and returns the text the remover wrote, with every
 * {@code <p></p>} occurrence stripped.
 *
 * @param html the HTML source to process
 * @param tagsToBeRemoved the tag names passed to the {@code TagRemover}
 * @return the remover's output without empty paragraph pairs
 */
public static String removeTags(String html, String[] tagsToBeRemoved) {
    StringBuilder output = new StringBuilder();
    TagRemover remover = new TagRemover(output, tagsToBeRemoved);
    new NodeTraversor(remover).traverse(Jsoup.parse(html).body());
    String rendered = output.toString();
    return rendered.replace("<p></p>", "");
}