List of usage examples for org.jsoup.nodes TextNode text
public String text()
From source file:com.bibisco.export.HTMLParser.java
/**
 * Recursively forwards one jsoup node to the exporter.
 * <p>
 * A text node is emitted with the formatting that is currently in effect. Any other node
 * gets its own copy of the formatting; depending on its tag name the exporter is told to
 * open a paragraph or list structure, to add an empty line, or a formatting flag is
 * switched on. The children are then parsed, and finally the structure (if one was opened)
 * is closed again.
 *
 * @param pNode           the node to export
 * @param pTextFormatting the formatting inherited from the parent node
 * @param pExporter       the exporter receiving the text and structure events
 */
private static void parseNode(Node pNode, TextFormatting pTextFormatting, IExporter pExporter) {

    mLog.debug("Start parseNode(Node, TextFormatting, IExporter)");

    if (!(pNode instanceof TextNode)) {
        // Work on a copy so that flags switched on here only apply to this subtree.
        TextFormatting lChildFormatting = pTextFormatting.clone();

        final String lStrTagName;
        if (pNode instanceof Element) {
            lStrTagName = ((Element) pNode).tag().getName();
        } else if (pNode instanceof Comment) {
            lStrTagName = ((Comment) pNode).outerHtml();
        } else {
            lStrTagName = "";
        }

        // Tags that need a closing call on the exporter once the children are done.
        final boolean lIsParagraph = lStrTagName.equalsIgnoreCase("p");
        final boolean lIsOrderedList = lStrTagName.equalsIgnoreCase("ol");
        final boolean lIsUnorderedList = lStrTagName.equalsIgnoreCase("ul");
        final boolean lIsListItem = lStrTagName.equalsIgnoreCase("li");

        // Step 1: opening action for this tag (at most one applies).
        if (lIsParagraph) {
            pExporter.startParagraph(getParagraphAlignment(((Element) pNode).attr("style")));
        } else if (lIsOrderedList) {
            pExporter.startOrderedList();
        } else if (lIsUnorderedList) {
            pExporter.startUnorderedList();
        } else if (lIsListItem) {
            pExporter.startListItem();
        } else if (lStrTagName.equalsIgnoreCase("br")) {
            pExporter.addEmptyLines(1);
        } else if (lStrTagName.equalsIgnoreCase("em")) {
            lChildFormatting.italic = true;
        } else if (lStrTagName.equalsIgnoreCase("strike")) {
            lChildFormatting.strike = true;
        } else if (lStrTagName.equalsIgnoreCase("strong")) {
            lChildFormatting.bold = true;
        } else if (lStrTagName.equalsIgnoreCase("u")) {
            lChildFormatting.underline = true;
        }

        // Step 2: every non-text node recurses into its children.
        parseChildNodes(pNode, lChildFormatting, pExporter);

        // Step 3: closing action for the structural tags.
        if (lIsParagraph) {
            pExporter.endParagraph();
        } else if (lIsOrderedList) {
            pExporter.endOrderedList();
        } else if (lIsUnorderedList) {
            pExporter.endUnorderedList();
        } else if (lIsListItem) {
            pExporter.endListItem();
        }
    } else {
        pExporter.addText(((TextNode) pNode).text(), pTextFormatting);
    }

    mLog.debug("End parseNode(Node, TextFormatting, IExporter)");
}
From source file:de.stkl.gbgvertretungsplan.sync.SyncAdapter.java
private Map<String, String> parseGeneralData(Element root, int dataType) { Map<String, String> generalData = new HashMap<String, String>(); // last update time and day Element updateTime = root.select("table.mon_head td:eq(2) p").first(); if (updateTime != null) { Pattern pat = Pattern.compile("(Stand: [\\.:0-9 ]+)", Pattern.DOTALL); Matcher matcher = pat.matcher(updateTime.text()); if (matcher.find()) generalData.put(Sync.GENERAL_DATA_UPDATETIME, matcher.group(1)); }/*from www.j a va2 s. c om*/ // date the substitution table belongs to Element belongingDate = root.select("div.mon_title").first(); if (belongingDate != null) generalData.put(Sync.GENERAL_DATA_DATE, belongingDate.text()); // daily information Elements dailyInfos = root.select("table.info tr"); int i = 0; for (Element info : dailyInfos) { Elements e = info.select("td"); if (e.size() == 0) continue; String title = "", description = ""; for (TextNode node : e.first().textNodes()) title += node.text() + '\n'; title = title.trim(); // description only if available if (e.size() > 1) { for (TextNode node : e.get(1).textNodes()) description += node.text() + '\n'; description = title.trim(); } String keyTitle = "", keyDescription = ""; switch (i) { case 0: keyTitle = Sync.GENERAL_DATA_DAILYINFO_1_TITLE; keyDescription = Sync.GENERAL_DATA_DAILYINFO_1_DESCRIPTION; break; case 1: keyTitle = Sync.GENERAL_DATA_DAILYINFO_2_TITLE; keyDescription = Sync.GENERAL_DATA_DAILYINFO_2_DESCRIPTION; break; case 2: keyTitle = Sync.GENERAL_DATA_DAILYINFO_3_TITLE; keyDescription = Sync.GENERAL_DATA_DAILYINFO_3_DESCRIPTION; break; default: break; } if (!keyTitle.equals("")) { generalData.put(keyTitle, title); generalData.put(keyDescription, description); } i++; } generalData.put(Sync.GENERAL_DATA_DATATYPE, String.valueOf(dataType)); return generalData; }
From source file:com.lingxiang2014.entity.Article.java
@Transient public String[] getPageContents() { if (StringUtils.isEmpty(content)) { return new String[] { "" }; }// w w w.ja v a 2 s . c om if (content.contains(PAGE_BREAK_SEPARATOR)) { return content.split(PAGE_BREAK_SEPARATOR); } else { List<String> pageContents = new ArrayList<String>(); Document document = Jsoup.parse(content); List<Node> children = document.body().childNodes(); if (children != null) { int textLength = 0; StringBuffer html = new StringBuffer(); for (Node node : children) { if (node instanceof Element) { Element element = (Element) node; html.append(element.outerHtml()); textLength += element.text().length(); if (textLength >= PAGE_CONTENT_LENGTH) { pageContents.add(html.toString()); textLength = 0; html.setLength(0); } } else if (node instanceof TextNode) { TextNode textNode = (TextNode) node; String text = textNode.text(); String[] contents = PARAGRAPH_SEPARATOR_PATTERN.split(text); Matcher matcher = PARAGRAPH_SEPARATOR_PATTERN.matcher(text); for (String content : contents) { if (matcher.find()) { content += matcher.group(); } html.append(content); textLength += content.length(); if (textLength >= PAGE_CONTENT_LENGTH) { pageContents.add(html.toString()); textLength = 0; html.setLength(0); } } } } String pageContent = html.toString(); if (StringUtils.isNotEmpty(pageContent)) { pageContents.add(pageContent); } } return pageContents.toArray(new String[pageContents.size()]); } }
From source file:by.heap.remark.convert.TextCleaner.java
/**
 * Returns the text of a text node, trimmed on the sides that touch a block boundary.
 * <p>
 * The left side is trimmed when the previous sibling is a block, when there is no
 * previous sibling and the parent is a block, or (normal text only) when the previous
 * sibling is a text node whose text matches {@code EMPTY_MATCHER}. The right side is
 * handled the same way using the next sibling, but only if text is left after the
 * left trim.
 *
 * @param tn         the text node to read
 * @param normalText {@code true} to use {@code text()}, {@code false} to use
 *                   {@code getWholeText()}
 * @return the possibly trimmed text
 */
private String getTextNodeText(TextNode tn, boolean normalText) {
    String result = normalText ? tn.text() : tn.getWholeText();
    Node before = tn.previousSibling();
    Node after = tn.nextSibling();
    boolean blockParent = isBlock(tn.parent());

    boolean trimLeft = isBlock(before) || (before == null && blockParent);
    if (!trimLeft && normalText && before instanceof TextNode) {
        trimLeft = EMPTY_MATCHER.matcher(((TextNode) before).text()).matches();
    }
    if (trimLeft) {
        result = ltrim(result);
    }

    // nothing left to trim on the right
    if (result.length() == 0) {
        return result;
    }

    boolean trimRight = isBlock(after) || (after == null && blockParent);
    if (!trimRight && normalText && after instanceof TextNode) {
        trimRight = EMPTY_MATCHER.matcher(((TextNode) after).text()).matches();
    }
    return trimRight ? rtrim(result) : result;
}
From source file:me.vertretungsplan.parser.DaVinciParser.java
/**
 * Parses one DaVinci page and adds its content to the given schedule.
 * <p>
 * The page title is taken from {@code h1.list-table-caption}, or from the first
 * {@code h2} (DaVinci 5). If the title contains a date, the page is treated as a single
 * day. Otherwise the title is used as the class name and the date is looked up in the
 * element following the title; when no date is found there either, no day object is
 * created and the last-change time is stored on the schedule instead.
 *
 * @param doc      the page to parse
 * @param schedule the schedule that receives the parsed day, table rows and last-change time
 * @throws IOException if the page contains no title element
 */
@NotNull
void parsePage(Element doc, SubstitutionSchedule schedule) throws IOException {
    SubstitutionScheduleDay day = new SubstitutionScheduleDay();

    Element titleElem = doc.select("h1.list-table-caption").first();
    if (titleElem == null) {
        // DaVinci 5
        titleElem = doc.select("h2").first();
    }
    if (titleElem == null) {
        // previously this surfaced as a NullPointerException without any context
        throw new IOException("DaVinci page contains neither h1.list-table-caption nor h2 title");
    }

    String title = titleElem.text();
    String klasse = null;
    // title can either be date or class
    Pattern datePattern = Pattern.compile("\\d+\\.\\d+.\\d{4}");
    Matcher dateMatcher = datePattern.matcher(title);
    if (dateMatcher.find()) {
        day.setDateString(dateMatcher.group());
        day.setDate(ParserUtils.parseDate(dateMatcher.group()));
    } else {
        klasse = title;
        // the title may be the last element of its parent
        Element nextElem = titleElem.nextElementSibling();
        String nextText = nextElem != null ? nextElem.text() : "";
        if (nextText.matches("\\w+ \\d+\\.\\d+.\\d{4}")) {
            day.setDateString(nextText);
            day.setDate(ParserUtils.parseDate(nextText));
        } else {
            // could not find date, must be multiple days
            day = null;
        }
    }

    // Messages can only be attached to a day; without one they are skipped.
    // The callout loop previously lacked this guard and failed with a
    // NullPointerException on pages without a date.
    if (day != null) {
        for (Element p : doc.select(".row:has(h1.list-table-caption) p")) {
            for (TextNode node : p.textNodes()) {
                String message = node.text().trim();
                if (!message.isEmpty()) {
                    day.addMessage(message);
                }
            }
        }
        for (Element callout : doc.select(".callout")) {
            for (TextNode node : callout.textNodes()) {
                String message = node.text().trim();
                if (!message.isEmpty()) {
                    day.addMessage(message);
                }
            }
        }
    }

    Element lastChangeElem = doc.select(".row.copyright div").first();
    if (lastChangeElem == null) {
        // DaVinci 5
        lastChangeElem = doc.select("h1").first();
    }
    // neither element present: leave the last-change time unset instead of failing
    String lastChange = lastChangeElem != null ? lastChangeElem.ownText() : "";

    LocalDateTime lastChangeTime = null;
    Matcher matcher = Pattern.compile("(\\d{2}-\\d{2}-\\d{4} \\d{2}:\\d{2}) \\|").matcher(lastChange);
    if (matcher.find()) {
        lastChangeTime = DateTimeFormat.forPattern("dd-MM-yyyy HH:mm")
                .parseLocalDateTime(matcher.group(1));
    } else {
        Matcher matcher2 = Pattern.compile("(\\d{2}.\\d{2}.\\d{4} \\| \\d+:\\d{2})").matcher(lastChange);
        if (matcher2.find()) {
            lastChangeTime = DateTimeFormat.forPattern("dd.MM.yyyy | HH:mm")
                    .parseLocalDateTime(matcher2.group(1));
        }
    }
    if (lastChangeTime != null) {
        if (day != null) {
            day.setLastChange(lastChangeTime);
        } else {
            schedule.setLastChange(lastChangeTime);
        }
    }

    if (doc.select(".list-table").size() > 0
            || !doc.select(".callout").text().contains("Es liegen keine")) {
        Element table = doc.select(".list-table, table").first();
        parseDaVinciTable(table, schedule, klasse, day, colorProvider);
    }

    if (day != null) {
        schedule.addDay(day);
    }
}
From source file:net.groupbuy.entity.Article.java
/** * ?/*from w w w.j ava 2 s.c om*/ * * @return */ @Transient public String[] getPageContents() { if (StringUtils.isEmpty(content)) { return new String[] { "" }; } if (content.contains(PAGE_BREAK_SEPARATOR)) { return content.split(PAGE_BREAK_SEPARATOR); } else { List<String> pageContents = new ArrayList<String>(); Document document = Jsoup.parse(content); List<Node> children = document.body().childNodes(); if (children != null) { int textLength = 0; StringBuffer html = new StringBuffer(); for (Node node : children) { if (node instanceof Element) { Element element = (Element) node; html.append(element.outerHtml()); textLength += element.text().length(); if (textLength >= PAGE_CONTENT_LENGTH) { pageContents.add(html.toString()); textLength = 0; html.setLength(0); } } else if (node instanceof TextNode) { TextNode textNode = (TextNode) node; String text = textNode.text(); String[] contents = PARAGRAPH_SEPARATOR_PATTERN.split(text); Matcher matcher = PARAGRAPH_SEPARATOR_PATTERN.matcher(text); for (String content : contents) { if (matcher.find()) { content += matcher.group(); } html.append(content); textLength += content.length(); if (textLength >= PAGE_CONTENT_LENGTH) { pageContents.add(html.toString()); textLength = 0; html.setLength(0); } } } } String pageContent = html.toString(); if (StringUtils.isNotEmpty(pageContent)) { pageContents.add(pageContent); } } return pageContents.toArray(new String[pageContents.size()]); } }
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
protected String getTitleByEditDistance(Element contentElement) throws Exception { final String metaTitle = doc.title(); final ArrayList<Double> max = new ArrayList<Double>(); max.add(0.0);// w w w .ja v a2 s. c o m final StringBuilder sb = new StringBuilder(); doc.body().traverse(new NodeVisitor() { public void head(Node node, int i) { if (node instanceof TextNode) { TextNode tn = (TextNode) node; String text = tn.text().trim(); double sim = strSim(text, metaTitle); if (sim > 0) { if (sim > max.get(0)) { max.set(0, sim); sb.setLength(0); sb.append(text); } } } } public void tail(Node node, int i) { } }); if (sb.length() > 0) { return sb.toString(); } throw new Exception(); }
From source file:mml.handler.post.MMLPostHTMLHandler.java
/** * Parse a span with a class or not//from w w w. j a v a 2 s.com * @param span the span in HTML */ private void parseSpan(Element span) throws JSONException { if (span.hasText()) { int offset = sb.length(); String name = span.attr("class"); Range r = new Range(name, offset, 0); if (name == null || name.length() == 0) name = "span"; if (isMilestone(name)) { pages.add(r); sb.append(span.text()); sb.append("\n"); pages.updateLen(r, sb.length() - offset); prevWasMilestone = true; } else if (name.equals("soft-hyphen")) { stil.add(r); // get previous word int i = sb.length() - 1; while (i > 0 && !Character.isWhitespace(sb.charAt(i))) i--; if (i > 0) i++; String prev = clean(sb.substring(i), true); // get next word String next = clean(nextWord(span), false); if (this.speller.isHardHyphen(prev, next)) r.name = "hard-hyphen"; sb.append(span.text()); stil.updateLen(r, sb.length() - offset); } else // span may contain other spans { stil.add(r); List<Node> children = span.childNodes(); for (Node child : children) { if (child instanceof Element) { String nName = child.nodeName().toLowerCase(); if (nName.equals("span")) parseSpan((Element) child); else parseOtherElement((Element) child); } else if (child instanceof TextNode) { TextNode tn = (TextNode) child; sb.append(tn.text()); } } if (isLineFormat(name)) ensure(1, false); stil.updateLen(r, sb.length() - offset); } } // else strangely no text: ignore it }
From source file:cn.edu.hfut.dmic.contentextractor.ContentExtractor.java
/**
 * Recursively computes the counting statistics of a node.
 * <p>
 * For an element:
 * <ol>
 * <li>its {@code style} and {@code class} attributes are removed,</li>
 * <li>if its text has the shape {@code a>b>c} (presumably breadcrumb navigation) a
 * result with density -200 is returned immediately and nothing is stored,</li>
 * <li>otherwise the counts of all children are summed up, the element itself is
 * counted as a tag ({@code a} and {@code img} as link tags, {@code p} as paragraph),
 * the text density is derived and the result is stored in {@code infoMap}.</li>
 * </ol>
 * For a text node the text length is the text count and the only leaf entry. Any other
 * node yields an empty result.
 *
 * @param node the node to analyse
 * @return the statistics of the node
 */
protected CountInfo computeInfo(Node node) {
    if (node instanceof TextNode) {
        int length = ((TextNode) node).text().length();
        CountInfo leafInfo = new CountInfo();
        leafInfo.textCount = length;
        leafInfo.leafList.add(length);
        return leafInfo;
    }
    if (!(node instanceof Element)) {
        return new CountInfo();
    }

    node.removeAttr("style").removeAttr("class");
    Element tag = (Element) node;
    CountInfo info = new CountInfo();

    if (tag.text().matches(".{1,20}>.{1,10}>.{1,20}")) {
        info.density = -200;
        return info;
    }

    // aggregate the statistics of the whole subtree
    for (Node child : tag.childNodes()) {
        CountInfo childInfo = computeInfo(child);
        info.textCount += childInfo.textCount;
        info.linkTextCount += childInfo.linkTextCount;
        info.tagCount += childInfo.tagCount;
        info.linkTagCount += childInfo.linkTagCount;
        info.leafList.addAll(childInfo.leafList);
        info.densitySum += childInfo.density;
        info.pCount += childInfo.pCount;
    }

    // count the element itself
    info.tagCount++;
    String tagName = tag.tagName();
    if ("a".equals(tagName) || "img".equals(tagName)) {
        // all text below a link counts as link text
        info.linkTextCount = info.textCount;
        info.linkTagCount++;
    } else if ("p".equals(tagName)) {
        info.pCount++;
    }

    // density = non-link text length per non-link tag
    int plainTextLength = info.textCount - info.linkTextCount;
    int plainTagCount = info.tagCount - info.linkTagCount;
    info.density = (plainTextLength == 0 || plainTagCount == 0)
            ? 0
            : (double) plainTextLength / plainTagCount;

    infoMap.put(tag, info);
    return info;
}
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/** * Extract text and hyperlinks from an element * supposingly containing only text./*from w w w . j av a 2 s . c o m*/ * * @param textElement * The element to be processed. * @param rawStr * The StringBuffer to contain the raw text. * @param linkedStr * The StringBuffer to contain the text with hyperlinks. */ private void processTextElement(Element textElement, StringBuilder rawStr, StringBuilder linkedStr) { // we process each element contained in the specified text element for (Node node : textElement.childNodes()) { // element node if (node instanceof Element) { Element element = (Element) node; String eltName = element.tag().getName(); // section headers: same thing if (eltName.equals(XmlNames.ELT_H2) || eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4) || eltName.equals(XmlNames.ELT_H5) || eltName.equals(XmlNames.ELT_H6)) { processParagraphElement(element, rawStr, linkedStr); } // paragraphs inside paragraphs are processed recursively else if (eltName.equals(XmlNames.ELT_P)) { processParagraphElement(element, rawStr, linkedStr); } // superscripts are to be avoided else if (eltName.equals(XmlNames.ELT_SUP)) { // they are either external references or WP inline notes // cf. http://en.wikipedia.org/wiki/Template%3ACitation_needed } // small caps are placed before phonetic transcriptions of names, which we avoid else if (eltName.equals(XmlNames.ELT_SMALL)) { // we don't need them, and they can mess up NER tools } // we ignore certain types of span (phonetic trancription, WP buttons...) 
else if (eltName.equals(XmlNames.ELT_SPAN)) { processSpanElement(element, rawStr, linkedStr); } // hyperlinks must be included in the linked string, provided they are not external else if (eltName.equals(XmlNames.ELT_A)) { processHyperlinkElement(element, rawStr, linkedStr); } // lists else if (eltName.equals(XmlNames.ELT_UL)) { processListElement(element, rawStr, linkedStr, false); } else if (eltName.equals(XmlNames.ELT_OL)) { processListElement(element, rawStr, linkedStr, true); } else if (eltName.equals(XmlNames.ELT_DL)) { processDescriptionListElement(element, rawStr, linkedStr); } // list item else if (eltName.equals(XmlNames.ELT_LI)) { processTextElement(element, rawStr, linkedStr); } // divisions are just processed recursively else if (eltName.equals(XmlNames.ELT_DIV)) { processDivisionElement(element, rawStr, linkedStr); } // quotes are just processed recursively else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) { processQuoteElement(element, rawStr, linkedStr); } // citation else if (eltName.equals(XmlNames.ELT_CITE)) { processParagraphElement(element, rawStr, linkedStr); } // other elements are considered as simple text else { String text = element.text(); rawStr.append(text); linkedStr.append(text); } } // text node else if (node instanceof TextNode) { // get the text TextNode textNode = (TextNode) node; String text = textNode.text(); // if at the begining of a new line, or already preceeded by a space, remove leading spaces while (rawStr.length() > 0 && (rawStr.charAt(rawStr.length() - 1) == '\n' || rawStr.charAt(rawStr.length() - 1) == ' ') && text.startsWith(" ")) text = text.substring(1); // complete string buffers rawStr.append(text); linkedStr.append(text); } } }