Example usage for org.jsoup.nodes TextNode text

List of usage examples for org.jsoup.nodes TextNode text

Introduction

On this page you can find usage examples for the text() method of org.jsoup.nodes.TextNode.

Prototype

public String text() 

Source Link

Document

Get the text content of this text node.

Usage

From source file:com.bibisco.export.HTMLParser.java

/**
 * Walks one jsoup node and forwards its content to the exporter.
 *
 * A text node is emitted directly with the formatting it inherited. Any other
 * node gets a cloned formatting object, an optional "open" action chosen by tag
 * name, a recursive visit of its children, and an optional "close" action.
 *
 * @param pNode           node to process
 * @param pTextFormatting formatting inherited from the ancestors; it is cloned
 *                        before being modified
 * @param pExporter       sink receiving text and structure events
 */
private static void parseNode(Node pNode, TextFormatting pTextFormatting, IExporter pExporter) {

    mLog.debug("Start parseNode(Node, TextFormatting, IExporter)");

    if (pNode instanceof TextNode) {
        // Leaf: write the text using the formatting handed down by the parents.
        pExporter.addText(((TextNode) pNode).text(), pTextFormatting);
    } else {
        // Children get their own copy so sibling subtrees do not see our changes.
        TextFormatting lChildFormatting = pTextFormatting.clone();

        String lName;
        if (pNode instanceof Element) {
            lName = ((Element) pNode).tag().getName();
        } else if (pNode instanceof Comment) {
            lName = ((Comment) pNode).outerHtml();
        } else {
            lName = "";
        }

        // Opening action (structure start or formatting flag) for known tags.
        if (lName.equalsIgnoreCase("p")) {
            pExporter.startParagraph(getParagraphAlignment(((Element) pNode).attr("style")));
        } else if (lName.equalsIgnoreCase("ol")) {
            pExporter.startOrderedList();
        } else if (lName.equalsIgnoreCase("ul")) {
            pExporter.startUnorderedList();
        } else if (lName.equalsIgnoreCase("li")) {
            pExporter.startListItem();
        } else if (lName.equalsIgnoreCase("br")) {
            pExporter.addEmptyLines(1);
        } else if (lName.equalsIgnoreCase("em")) {
            lChildFormatting.italic = true;
        } else if (lName.equalsIgnoreCase("strike")) {
            lChildFormatting.strike = true;
        } else if (lName.equalsIgnoreCase("strong")) {
            lChildFormatting.bold = true;
        } else if (lName.equalsIgnoreCase("u")) {
            lChildFormatting.underline = true;
        }

        // Every kind of non-text node descends into its children exactly once.
        parseChildNodes(pNode, lChildFormatting, pExporter);

        // Closing action for the structural tags only.
        if (lName.equalsIgnoreCase("p")) {
            pExporter.endParagraph();
        } else if (lName.equalsIgnoreCase("ol")) {
            pExporter.endOrderedList();
        } else if (lName.equalsIgnoreCase("ul")) {
            pExporter.endUnorderedList();
        } else if (lName.equalsIgnoreCase("li")) {
            pExporter.endListItem();
        }
    }

    mLog.debug("End parseNode(Node, TextFormatting, IExporter)");
}

From source file:de.stkl.gbgvertretungsplan.sync.SyncAdapter.java

/**
 * Extracts the general (non-tabular) data of a substitution plan page.
 *
 * Collected entries: the "Stand: ..." update time, the date the plan belongs
 * to, up to three daily-information rows (title and description) and the
 * given data type.
 *
 * @param root     root element of the parsed page
 * @param dataType data type identifier, stored as its decimal string
 * @return map from {@code Sync.GENERAL_DATA_*} keys to the extracted values;
 *         entries whose source element is missing are simply absent
 */
private Map<String, String> parseGeneralData(Element root, int dataType) {
    Map<String, String> generalData = new HashMap<String, String>();

    // last update time and day
    Element updateTime = root.select("table.mon_head td:eq(2) p").first();
    if (updateTime != null) {
        Pattern pat = Pattern.compile("(Stand: [\\.:0-9 ]+)", Pattern.DOTALL);
        Matcher matcher = pat.matcher(updateTime.text());
        if (matcher.find()) {
            generalData.put(Sync.GENERAL_DATA_UPDATETIME, matcher.group(1));
        }
    }

    // date the substitution table belongs to
    Element belongingDate = root.select("div.mon_title").first();
    if (belongingDate != null) {
        generalData.put(Sync.GENERAL_DATA_DATE, belongingDate.text());
    }

    // daily information: only the first three rows that contain cells are kept
    final String[] titleKeys = { Sync.GENERAL_DATA_DAILYINFO_1_TITLE, Sync.GENERAL_DATA_DAILYINFO_2_TITLE,
            Sync.GENERAL_DATA_DAILYINFO_3_TITLE };
    final String[] descriptionKeys = { Sync.GENERAL_DATA_DAILYINFO_1_DESCRIPTION,
            Sync.GENERAL_DATA_DAILYINFO_2_DESCRIPTION, Sync.GENERAL_DATA_DAILYINFO_3_DESCRIPTION };

    int i = 0;
    for (Element info : root.select("table.info tr")) {
        Elements cells = info.select("td");
        if (cells.size() == 0) {
            continue; // rows without cells do not consume a slot
        }

        // one line per direct text node of the first cell
        StringBuilder titleBuf = new StringBuilder();
        for (TextNode node : cells.first().textNodes()) {
            titleBuf.append(node.text()).append('\n');
        }
        String title = titleBuf.toString().trim();

        // description only if available
        String description = "";
        if (cells.size() > 1) {
            StringBuilder descriptionBuf = new StringBuilder();
            for (TextNode node : cells.get(1).textNodes()) {
                descriptionBuf.append(node.text()).append('\n');
            }
            // Trim the description itself; it used to be overwritten with the title here.
            description = descriptionBuf.toString().trim();
        }

        if (i < titleKeys.length) {
            generalData.put(titleKeys[i], title);
            generalData.put(descriptionKeys[i], description);
        }
        i++;
    }

    generalData.put(Sync.GENERAL_DATA_DATATYPE, String.valueOf(dataType));

    return generalData;
}

From source file:com.lingxiang2014.entity.Article.java

/**
 * Splits the article content into pages.
 *
 * Empty content yields a single empty page. Content containing
 * {@code PAGE_BREAK_SEPARATOR} is split on it. Otherwise the content is parsed
 * as HTML and the top-level body nodes are accumulated into a page until the
 * running text length reaches {@code PAGE_CONTENT_LENGTH}.
 *
 * @return the page fragments in document order
 */
@Transient
public String[] getPageContents() {
    if (StringUtils.isEmpty(content)) {
        return new String[] { "" };
    }
    if (content.contains(PAGE_BREAK_SEPARATOR)) {
        return content.split(PAGE_BREAK_SEPARATOR);
    }

    List<String> pages = new ArrayList<String>();
    List<Node> bodyNodes = Jsoup.parse(content).body().childNodes();
    if (bodyNodes != null) {
        StringBuilder page = new StringBuilder();
        int pageTextLength = 0;
        for (Node bodyNode : bodyNodes) {
            if (bodyNode instanceof Element) {
                // Elements are never split: append the whole markup, count only its text.
                Element element = (Element) bodyNode;
                page.append(element.outerHtml());
                pageTextLength += element.text().length();
                if (pageTextLength >= PAGE_CONTENT_LENGTH) {
                    pages.add(page.toString());
                    page.setLength(0);
                    pageTextLength = 0;
                }
            } else if (bodyNode instanceof TextNode) {
                // Bare text is cut at paragraph separators; each piece keeps the
                // separator that followed it, when the matcher still finds one.
                String text = ((TextNode) bodyNode).text();
                String[] pieces = PARAGRAPH_SEPARATOR_PATTERN.split(text);
                Matcher separators = PARAGRAPH_SEPARATOR_PATTERN.matcher(text);
                for (int k = 0; k < pieces.length; k++) {
                    String piece = separators.find() ? pieces[k] + separators.group() : pieces[k];
                    page.append(piece);
                    pageTextLength += piece.length();
                    if (pageTextLength >= PAGE_CONTENT_LENGTH) {
                        pages.add(page.toString());
                        page.setLength(0);
                        pageTextLength = 0;
                    }
                }
            }
        }
        // Whatever is left over forms the final, shorter page.
        String remainder = page.toString();
        if (StringUtils.isNotEmpty(remainder)) {
            pages.add(remainder);
        }
    }
    return pages.toArray(new String[pages.size()]);
}

From source file:by.heap.remark.convert.TextCleaner.java

/**
 * Returns the text of a text node with whitespace removed at the edges that
 * border a block boundary.
 *
 * The left edge is trimmed when the previous sibling is a block, when there is
 * no previous sibling and the parent is a block, or (normal text only) when
 * the previous sibling is a text node matching {@code EMPTY_MATCHER}. The right
 * edge follows the same rules with the next sibling, but is only examined when
 * text remains after the left trim.
 *
 * @param tn         text node to read
 * @param normalText {@code true} to use {@code text()}, {@code false} to use
 *                   {@code getWholeText()}
 * @return the possibly trimmed text
 */
private String getTextNodeText(TextNode tn, boolean normalText) {
    String result = normalText ? tn.text() : tn.getWholeText();
    Node before = tn.previousSibling();
    Node after = tn.nextSibling();
    boolean insideBlock = isBlock(tn.parent());

    boolean stripLeft = isBlock(before)
            || (before == null && insideBlock)
            || (normalText && before instanceof TextNode
                    && EMPTY_MATCHER.matcher(((TextNode) before).text()).matches());
    if (stripLeft) {
        result = ltrim(result);
    }

    // Nothing left to trim on the right.
    if (result.length() == 0) {
        return result;
    }

    boolean stripRight = isBlock(after)
            || (after == null && insideBlock)
            || (normalText && after instanceof TextNode
                    && EMPTY_MATCHER.matcher(((TextNode) after).text()).matches());
    if (stripRight) {
        result = rtrim(result);
    }
    return result;
}

From source file:me.vertretungsplan.parser.DaVinciParser.java

/**
 * Parses one DaVinci page and adds its content to the schedule.
 *
 * The page title is either a date (single-day page) or a class name. When it
 * is a class name, the date is looked up in the element following the title;
 * if no date is found there, the page is treated as covering multiple days and
 * no day object is created ({@code day} is {@code null} from then on).
 *
 * @param doc      root element of the page
 * @param schedule schedule receiving the parsed day, table and last-change time
 * @throws IOException if the page has no title element
 */
@NotNull
void parsePage(Element doc, SubstitutionSchedule schedule) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    Element titleElem = doc.select("h1.list-table-caption").first();
    if (titleElem == null) {
        // DaVinci 5
        titleElem = doc.select("h2").first();
    }
    if (titleElem == null) {
        // Fail with a descriptive error instead of a NullPointerException below.
        throw new IOException("could not find page title (h1.list-table-caption or h2)");
    }
    String title = titleElem.text();
    String klasse = null;
    // title can either be date or class
    Pattern datePattern = Pattern.compile("\\d+\\.\\d+.\\d{4}");
    Matcher dateMatcher = datePattern.matcher(title);
    if (dateMatcher.find()) {
        day.setDateString(dateMatcher.group());
        day.setDate(ParserUtils.parseDate(dateMatcher.group()));
    } else {
        klasse = title;
        // The title may be the last element of its parent; treat that as "no date".
        Element nextElem = titleElem.nextElementSibling();
        String nextText = nextElem != null ? nextElem.text() : "";
        if (nextText.matches("\\w+ \\d+\\.\\d+.\\d{4}")) {
            day.setDateString(nextText);
            day.setDate(ParserUtils.parseDate(nextText));
        } else {
            // could not find date, must be multiple days
            day = null;
        }
    }

    // Messages can only be attached to a day; without one they are skipped.
    if (day != null) {
        for (Element p : doc.select(".row:has(h1.list-table-caption) p")) {
            for (TextNode node : p.textNodes()) {
                String message = node.text().trim();
                if (!message.isEmpty()) {
                    day.addMessage(message);
                }
            }
        }
        // This loop used to dereference day unconditionally and threw a
        // NullPointerException on multi-day pages containing a callout.
        for (Element callout : doc.select(".callout")) {
            for (TextNode node : callout.textNodes()) {
                String message = node.text().trim();
                if (!message.isEmpty()) {
                    day.addMessage(message);
                }
            }
        }
    }

    Element lastChangeElem = doc.select(".row.copyright div").first();
    if (lastChangeElem == null) {
        // DaVinci 5
        lastChangeElem = doc.select("h1").first();
    }
    if (lastChangeElem != null) {
        String lastChange = lastChangeElem.ownText();
        LocalDateTime lastChangeTime = null;
        Matcher matcher = Pattern.compile("(\\d{2}-\\d{2}-\\d{4} \\d{2}:\\d{2}) \\|").matcher(lastChange);
        if (matcher.find()) {
            lastChangeTime = DateTimeFormat.forPattern("dd-MM-yyyy HH:mm")
                    .parseLocalDateTime(matcher.group(1));
        } else {
            // alternative "dd.MM.yyyy | H:mm" layout
            Matcher matcher2 = Pattern.compile("(\\d{2}.\\d{2}.\\d{4} \\| \\d+:\\d{2})").matcher(lastChange);
            if (matcher2.find()) {
                lastChangeTime = DateTimeFormat.forPattern("dd.MM.yyyy | HH:mm")
                        .parseLocalDateTime(matcher2.group(1));
            }
        }
        if (lastChangeTime != null) {
            // Store on the day when there is one, otherwise on the schedule as a whole.
            if (day != null) {
                day.setLastChange(lastChangeTime);
            } else {
                schedule.setLastChange(lastChangeTime);
            }
        }
    }

    if (doc.select(".list-table").size() > 0 || !doc.select(".callout").text().contains("Es liegen keine")) {
        Element table = doc.select(".list-table, table").first();
        parseDaVinciTable(table, schedule, klasse, day, colorProvider);
    }

    if (day != null) {
        schedule.addDay(day);
    }
}

From source file:net.groupbuy.entity.Article.java

/**
 * ?/*from   w  w w.j ava 2 s.c  om*/
 * 
 * @return 
 */
@Transient
public String[] getPageContents() {
    if (StringUtils.isEmpty(content)) {
        return new String[] { "" };
    }
    if (content.contains(PAGE_BREAK_SEPARATOR)) {
        return content.split(PAGE_BREAK_SEPARATOR);
    } else {
        List<String> pageContents = new ArrayList<String>();
        Document document = Jsoup.parse(content);
        List<Node> children = document.body().childNodes();
        if (children != null) {
            int textLength = 0;
            StringBuffer html = new StringBuffer();
            for (Node node : children) {
                if (node instanceof Element) {
                    Element element = (Element) node;
                    html.append(element.outerHtml());
                    textLength += element.text().length();
                    if (textLength >= PAGE_CONTENT_LENGTH) {
                        pageContents.add(html.toString());
                        textLength = 0;
                        html.setLength(0);
                    }
                } else if (node instanceof TextNode) {
                    TextNode textNode = (TextNode) node;
                    String text = textNode.text();
                    String[] contents = PARAGRAPH_SEPARATOR_PATTERN.split(text);
                    Matcher matcher = PARAGRAPH_SEPARATOR_PATTERN.matcher(text);
                    for (String content : contents) {
                        if (matcher.find()) {
                            content += matcher.group();
                        }
                        html.append(content);
                        textLength += content.length();
                        if (textLength >= PAGE_CONTENT_LENGTH) {
                            pageContents.add(html.toString());
                            textLength = 0;
                            html.setLength(0);
                        }
                    }
                }
            }
            String pageContent = html.toString();
            if (StringUtils.isNotEmpty(pageContent)) {
                pageContents.add(pageContent);
            }
        }
        return pageContents.toArray(new String[pageContents.size()]);
    }
}

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * Finds the text node in the document body that is most similar to the
 * document's title, as scored by {@code strSim}.
 *
 * @param contentElement not used by this method
 * @return the trimmed text of the best-scoring text node
 * @throws Exception if no text node has a similarity greater than zero
 */
protected String getTitleByEditDistance(Element contentElement) throws Exception {
    final String metaTitle = doc.title();

    // Single-element array so the anonymous visitor can update the running maximum.
    final double[] bestScore = { 0.0 };
    final StringBuilder bestText = new StringBuilder();

    doc.body().traverse(new NodeVisitor() {

        public void head(Node node, int depth) {
            if (!(node instanceof TextNode)) {
                return;
            }
            String candidate = ((TextNode) node).text().trim();
            double score = strSim(candidate, metaTitle);
            // Strictly greater: the first node reaching a given score wins.
            if (score > 0 && score > bestScore[0]) {
                bestScore[0] = score;
                bestText.setLength(0);
                bestText.append(candidate);
            }
        }

        public void tail(Node node, int depth) {
            // nothing to do when leaving a node
        }
    });

    if (bestText.length() == 0) {
        throw new Exception();
    }
    return bestText.toString();
}

From source file:mml.handler.post.MMLPostHTMLHandler.java

/**
 * Parses a span element, with or without a class attribute, appending its
 * text to {@code sb} and recording a range for it.
 *
 * Milestone spans are recorded in {@code pages}; all other spans are recorded
 * in {@code stil}. Spans without any text are ignored.
 *
 * @param span the span in HTML
 */
private void parseSpan(Element span) throws JSONException {
    if (!span.hasText()) {
        // strangely no text: ignore it
        return;
    }

    final int start = sb.length();
    String name = span.attr("class");
    // NOTE(review): the range is created before the "span" default below is
    // applied, so a class-less span gets a range with an empty name - confirm
    // this is intended.
    Range r = new Range(name, start, 0);
    if (name == null || name.length() == 0) {
        name = "span";
    }

    if (isMilestone(name)) {
        pages.add(r);
        sb.append(span.text());
        sb.append("\n");
        pages.updateLen(r, sb.length() - start);
        prevWasMilestone = true;
        return;
    }

    stil.add(r);

    if (name.equals("soft-hyphen")) {
        // Scan back to the whitespace preceding the previous word.
        // NOTE(review): if sb is empty the index stays at -1 and substring
        // throws - presumably a hyphen never starts the text; verify.
        int wordStart = sb.length() - 1;
        while (wordStart > 0 && !Character.isWhitespace(sb.charAt(wordStart))) {
            wordStart--;
        }
        if (wordStart > 0) {
            wordStart++;
        }
        String prev = clean(sb.substring(wordStart), true);
        String next = clean(nextWord(span), false);
        if (this.speller.isHardHyphen(prev, next)) {
            r.name = "hard-hyphen";
        }
        sb.append(span.text());
    } else {
        // span may contain other spans
        for (Node child : span.childNodes()) {
            if (child instanceof Element) {
                Element childElement = (Element) child;
                if (child.nodeName().toLowerCase().equals("span")) {
                    parseSpan(childElement);
                } else {
                    parseOtherElement(childElement);
                }
            } else if (child instanceof TextNode) {
                sb.append(((TextNode) child).text());
            }
        }
        if (isLineFormat(name)) {
            ensure(1, false);
        }
    }

    stil.updateLen(r, sb.length() - start);
}

From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java

/**
 * Computes text and tag statistics for a node and, recursively, its subtree.
 *
 * For an element this method:
 * 1. removes the element's style and class attributes;
 * 2. sums the counts of all children and derives a density, i.e. non-link
 *    text length divided by non-link tag count;
 * 3. counts p tags;
 * and stores the result in {@code infoMap}. An element whose text matches the
 * "a > b > c" pattern gets a fixed density of -200 and is neither descended
 * into nor stored. A text node contributes its text length.
 *
 * @param node node to analyse
 * @return the statistics of the node; an empty {@code CountInfo} for nodes
 *         that are neither elements nor text nodes
 */
protected CountInfo computeInfo(Node node) {
    if (node instanceof TextNode) {
        CountInfo leaf = new CountInfo();
        int length = ((TextNode) node).text().length();
        leaf.textCount = length;
        leaf.leafList.add(length);
        return leaf;
    }
    if (!(node instanceof Element)) {
        return new CountInfo();
    }

    node.removeAttr("style").removeAttr("class");
    Element tag = (Element) node;

    if (tag.text().matches(".{1,20}>.{1,10}>.{1,20}")) {
        CountInfo penalised = new CountInfo();
        penalised.density = -200;
        return penalised;
    }

    // Aggregate the statistics of every child into this element's totals.
    CountInfo total = new CountInfo();
    for (Node child : tag.childNodes()) {
        CountInfo part = computeInfo(child);
        total.textCount += part.textCount;
        total.linkTextCount += part.linkTextCount;
        total.tagCount += part.tagCount;
        total.linkTagCount += part.linkTagCount;
        total.leafList.addAll(part.leafList);
        total.densitySum += part.density;
        total.pCount += part.pCount;
    }

    // Account for the element itself.
    total.tagCount++;
    String tagName = tag.tagName();
    if ("a".equals(tagName) || "img".equals(tagName)) {
        // Everything inside a link or image counts as link text.
        total.linkTextCount = total.textCount;
        total.linkTagCount++;
    } else if ("p".equals(tagName)) {
        total.pCount++;
    }

    int plainTextLength = total.textCount - total.linkTextCount;
    int plainTagCount = total.tagCount - total.linkTagCount;
    if (plainTextLength != 0 && plainTagCount != 0) {
        total.density = (double) plainTextLength / plainTagCount;
    } else {
        total.density = 0;
    }

    infoMap.put(tag, total);
    return total;
}

From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java

/**
 * Extract text and hyperlinks from an element
 * supposingly containing only text./*from  w  w  w  . j  av  a 2 s  . c o  m*/
 * 
 * @param textElement
 *       The element to be processed.
 * @param rawStr
 *       The StringBuffer to contain the raw text.
 * @param linkedStr
 *       The StringBuffer to contain the text with hyperlinks.
 */
private void processTextElement(Element textElement, StringBuilder rawStr, StringBuilder linkedStr) { // we process each element contained in the specified text element
    for (Node node : textElement.childNodes()) { // element node
        if (node instanceof Element) {
            Element element = (Element) node;
            String eltName = element.tag().getName();

            // section headers: same thing
            if (eltName.equals(XmlNames.ELT_H2) || eltName.equals(XmlNames.ELT_H3)
                    || eltName.equals(XmlNames.ELT_H4) || eltName.equals(XmlNames.ELT_H5)
                    || eltName.equals(XmlNames.ELT_H6)) {
                processParagraphElement(element, rawStr, linkedStr);
            }

            // paragraphs inside paragraphs are processed recursively
            else if (eltName.equals(XmlNames.ELT_P)) {
                processParagraphElement(element, rawStr, linkedStr);
            }

            // superscripts are to be avoided
            else if (eltName.equals(XmlNames.ELT_SUP)) { // they are either external references or WP inline notes
                                                         // cf. http://en.wikipedia.org/wiki/Template%3ACitation_needed
            }

            // small caps are placed before phonetic transcriptions of names, which we avoid
            else if (eltName.equals(XmlNames.ELT_SMALL)) { // we don't need them, and they can mess up NER tools
            }

            // we ignore certain types of span (phonetic trancription, WP buttons...) 
            else if (eltName.equals(XmlNames.ELT_SPAN)) {
                processSpanElement(element, rawStr, linkedStr);
            }

            // hyperlinks must be included in the linked string, provided they are not external
            else if (eltName.equals(XmlNames.ELT_A)) {
                processHyperlinkElement(element, rawStr, linkedStr);
            }

            // lists
            else if (eltName.equals(XmlNames.ELT_UL)) {
                processListElement(element, rawStr, linkedStr, false);
            } else if (eltName.equals(XmlNames.ELT_OL)) {
                processListElement(element, rawStr, linkedStr, true);
            } else if (eltName.equals(XmlNames.ELT_DL)) {
                processDescriptionListElement(element, rawStr, linkedStr);
            }

            // list item
            else if (eltName.equals(XmlNames.ELT_LI)) {
                processTextElement(element, rawStr, linkedStr);
            }

            // divisions are just processed recursively
            else if (eltName.equals(XmlNames.ELT_DIV)) {
                processDivisionElement(element, rawStr, linkedStr);
            }

            // quotes are just processed recursively
            else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) {
                processQuoteElement(element, rawStr, linkedStr);
            }
            // citation
            else if (eltName.equals(XmlNames.ELT_CITE)) {
                processParagraphElement(element, rawStr, linkedStr);
            }

            // other elements are considered as simple text
            else {
                String text = element.text();
                rawStr.append(text);
                linkedStr.append(text);
            }
        }

        // text node
        else if (node instanceof TextNode) { // get the text
            TextNode textNode = (TextNode) node;
            String text = textNode.text();
            // if at the begining of a new line, or already preceeded by a space, remove leading spaces
            while (rawStr.length() > 0
                    && (rawStr.charAt(rawStr.length() - 1) == '\n' || rawStr.charAt(rawStr.length() - 1) == ' ')
                    && text.startsWith(" "))
                text = text.substring(1);
            // complete string buffers
            rawStr.append(text);
            linkedStr.append(text);
        }
    }
}