Example usage for org.jsoup.select NodeTraversor NodeTraversor

List of usage examples for org.jsoup.select NodeTraversor NodeTraversor

Introduction

On this page you can find example usages of the org.jsoup.select.NodeTraversor constructor, NodeTraversor(NodeVisitor).

Prototype

public NodeTraversor(NodeVisitor visitor) 

Source Link

Document

Create a new traversor.

Usage

From source file:com.trackplus.track.util.html.Html2LaTeX.java

/**
 * Format an Element to LaTeX/*from   w  w  w  .j  a v  a2 s.  c  o m*/
 *
 * @param element
 *            the root element to format
 * @return formatted text
 */
public static String getLatexText(Element element) {
    FormattingVisitor formatter = new FormattingVisitor();
    NodeTraversor traversor = new NodeTraversor(formatter);
    traversor.traverse(element); // walk the DOM, and call .head() and
    // .tail() for each node

    return formatter.toString();
}

From source file:com.twentyn.patentExtractor.PatentDocument.java

/**
 * Gathers the text segments produced for every node in {@code textNodes}.
 *
 * <p>Each node is converted to a standalone document under a "body" root via
 * {@code Util.nodeToDocument}, serialized to a string, re-parsed with Jsoup,
 * and visited with an {@code HtmlVisitor}; the visitor's text content is
 * appended to the result in node order.
 *
 * @param docBuilder builder handed to {@code Util.nodeToDocument}
 * @param textNodes  nodes to extract text from; may be {@code null}
 * @return the collected text segments, empty when {@code textNodes} is {@code null}
 */
private static List<String> extractTextFromHTML(DocumentBuilder docBuilder, NodeList textNodes)
        throws ParserConfigurationException, TransformerConfigurationException, TransformerException,
        XPathExpressionException {
    List<String> collected = new ArrayList<>(0);
    if (textNodes == null) {
        return collected;
    }

    /* The patent body mixes HTML and XML, hence the round trip: the XML node is serialized to a
     * string, Jsoup parses the HTML entities found in it, and a NodeVisitor recursively walks the
     * resulting document to pull out the text content in reasonable chunks.
     */
    for (int idx = 0; idx < textNodes.getLength(); idx++) {
        String serialized = Util.documentToString(Util.nodeToDocument(docBuilder, "body", textNodes.item(idx)));
        // With help from http://stackoverflow.com/questions/832620/stripping-html-tags-in-java
        org.jsoup.nodes.Document parsedHtml = Jsoup.parse(serialized);
        HtmlVisitor textCollector = new HtmlVisitor();
        new NodeTraversor(textCollector).traverse(parsedHtml);
        collected.addAll(textCollector.getTextContent());
    }
    return collected;
}

From source file:no.kantega.publishing.admin.content.htmlfilter.RemoveNestedSpanTagsFilter.java

/**
 * Traverses {@code source} with a {@code CleaningVisitor} constructed
 * around {@code dest}.
 *
 * @param source root element of the tree to walk
 * @param dest   element handed to the cleaning visitor
 */
private void copySafeNodes(Element source, Element dest) {
    new NodeTraversor(new CleaningVisitor(dest)).traverse(source);
}

From source file:org.apache.jmeter.protocol.http.parser.JsoupBasedHtmlParser.java

/**
 * Parses {@code html} with Jsoup and walks the resulting document with a
 * {@code JMeterNodeVisitor} that is given {@code baseUrl} and {@code coll}.
 *
 * @param userAgent not used by this implementation
 * @param html      raw page bytes
 * @param baseUrl   base URL wrapped in a {@code URLPointer} for the visitor
 * @param coll      collection passed to the visitor; its iterator is returned
 * @param encoding  charset name used to decode {@code html}
 * @return {@code coll.iterator()} after the traversal
 * @throws HTMLParseException wrapping any exception raised while decoding,
 *                            parsing or traversing
 */
@Override
public Iterator<URL> getEmbeddedResourceURLs(String userAgent, byte[] html, URL baseUrl, URLCollection coll,
        String encoding) throws HTMLParseException {
    try {
        // TODO Handle conditional comments for IE
        Document parsed = Jsoup.parse(new String(html, encoding));
        URLPointer basePointer = new URLPointer(baseUrl);
        JMeterNodeVisitor resourceVisitor = new JMeterNodeVisitor(basePointer, coll);
        NodeTraversor walker = new NodeTraversor(resourceVisitor);
        walker.traverse(parsed);
        return coll.iterator();
    } catch (Exception ex) {
        throw new HTMLParseException(ex);
    }
}

From source file:org.jsweet.input.typescriptdef.visitor.DocFiller.java

/**
 * Runs the documentation grabbers for a type declaration that has no
 * documentation yet.
 *
 * <p>Nothing happens unless the current module is
 * {@code JSweetDefTranslatorConfig.LANG_PACKAGE} or
 * {@code JSweetDefTranslatorConfig.DOM_PACKAGE} and cached "mdn" content
 * exists for the type. The content is parsed with Jsoup and its body is
 * traversed three times, each time with a freshly created grabber: table
 * format, definition-list format, then main description. Any failure is
 * reported through {@code context.reportError} rather than propagated.
 *
 * @param typeDeclaration the declaration being visited
 */
@Override
public void visitTypeDeclaration(TypeDeclaration typeDeclaration) {
    if (typeDeclaration.getDocumentation() != null) {
        return;
    }

    String moduleName = getCurrentModuleName();
    boolean documentedModule = JSweetDefTranslatorConfig.LANG_PACKAGE.equals(moduleName)
            || JSweetDefTranslatorConfig.DOM_PACKAGE.equals(moduleName);
    if (!documentedModule) {
        return;
    }
    this.currentModule = moduleName;

    String content = getTypeContent(context.cacheDir, "mdn", moduleName, typeDeclaration.getName());
    if (content == null) {
        return;
    }

    try {
        // The content is already a decoded String, so no charset is needed. The two-String
        // overload Jsoup.parse(html, baseUri) takes a base URI, not an encoding, so passing
        // "UTF-8" there only set a bogus base URI.
        Document doc = Jsoup.parse(content);
        NodeTraversor traversor;
        traversor = new NodeTraversor(new MdnTableFormatGrabber(this, typeDeclaration));
        traversor.traverse(doc.body());
        traversor = new NodeTraversor(new MdnDefinitionListFormatGrabber(this, typeDeclaration));
        traversor.traverse(doc.body());
        traversor = new NodeTraversor(new MdnMainDescriptionGrabber(this, typeDeclaration));
        traversor.traverse(doc.body());
    } catch (Throwable t) {
        // Deliberately best-effort: a documentation failure is reported, never fatal.
        context.reportError("cannot fill documentation for " + context.getTypeName(typeDeclaration),
                typeDeclaration.getToken(), t);
    }
}

From source file:org.jsweet.input.typescriptdef.visitor.DocFiller.java

/**
 * Parses {@code html} with Jsoup, walks the document body with a
 * {@code TagRemover} writing into a buffer, and returns the buffer's
 * content with every literal {@code <p></p>} removed.
 *
 * @param html            the HTML text to process
 * @param tagsToBeRemoved tag names handed to the {@code TagRemover}
 * @return the text produced by the traversal, without empty paragraph pairs
 */
public static String removeTags(String html, String[] tagsToBeRemoved) {
    StringBuilder output = new StringBuilder();
    new NodeTraversor(new TagRemover(output, tagsToBeRemoved)).traverse(Jsoup.parse(html).body());
    String rendered = output.toString();
    return rendered.replace("<p></p>", "");
}