List of usage examples for org.jsoup.nodes Element tag
Tag tag
To view the source code for org.jsoup.nodes Element tag, click the Source Link below each example.
From source file:Main.java
private static boolean preserveWhitespace(Node node) { // looks only at this element and one level up, to prevent recursion & needless stack searches if (node != null && node instanceof Element) { Element element = (Element) node; return element.tag().preserveWhitespace() || element.parent() != null && element.parent().tag().preserveWhitespace(); }//from w w w.ja v a2 s. co m return false; }
From source file:Main.java
/**
 * Appends a newline to the accumulator when the element is a {@code <br>} tag.
 *
 * @param element the element to examine
 * @param accum   the buffer receiving accumulated text
 */
private static void appendNewlineIfBr(Element element, StringBuilder accum) {
    boolean isBreak = "br".equals(element.tag().getName());
    if (isBreak) {
        accum.append("\n");
    }
}
From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.tipicality.DbpediaCsvDownload.java
/**
 * Recursively scans the children of the given element for CSV download links
 * (anchors whose text is "csv" inside {@code <small>} elements) and downloads
 * each linked file into {@code DBpediaOntology.DBPEDIA_CSV_FOLDER}, recursing
 * into nested {@code <ul>}/{@code <li>} lists.
 *
 * @param e the element whose children are scanned
 * @throws MalformedURLException if a discovered href is not a valid URL
 * @throws IOException           if opening a stream or writing a file fails
 */
private static void download(Element e) throws MalformedURLException, IOException {
    for (Element c : e.children()) {
        String tagName = c.tag().getName();
        if (tagName.equals("small")) {
            for (Element c1 : c.children()) {
                if (c1.tag().getName().equals("a") && c1.text().equalsIgnoreCase("csv")) {
                    String href = c1.attr("href");
                    System.out.println("Downloading " + href);
                    try {
                        URL remoteFile = new URL(href);
                        String[] s = href.split("\\/");
                        // try-with-resources: the original leaked both the channel
                        // and the FileOutputStream on every download
                        try (ReadableByteChannel rbc = Channels.newChannel(remoteFile.openStream());
                                FileOutputStream fos = new FileOutputStream(
                                        DBpediaOntology.DBPEDIA_CSV_FOLDER + s[s.length - 1])) {
                            fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
                        }
                    } catch (Exception ex) {
                        // best-effort: log and continue with the remaining files
                        ex.printStackTrace();
                    }
                }
            }
        } else if (tagName.equals("ul")) {
            for (Element c1 : c.children()) {
                if (c1.tagName().equals("li")) {
                    download(c1);
                }
            }
        }
    }
}
From source file:com.gistlabs.mechanize.document.html.JsoupDataUtil.java
private static void filterElementsByTag(List<Element> results, Element element, Set<String> tagSet) { if (tagSet.contains(element.tag().getName().toLowerCase())) results.add(element);//from w ww.j ava 2 s . c o m for (Element child : element.children()) filterElementsByTag(results, child, tagSet); }
From source file:com.soulgalore.crawler.core.impl.AhrefPageURLParser.java
private Set<CrawlerURL> fetch(String query, String attributeKey, Document doc, String url) { final Set<CrawlerURL> urls = new HashSet<CrawlerURL>(); final Elements elements = doc.select(query); for (Element src : elements) { if (src.attr(attributeKey).isEmpty()) continue; // don't fetch mailto links if (src.attr(attributeKey).startsWith(MAIL_TO)) continue; else if (IFRAME.equals(src.tag().getName())) urls.add(new CrawlerURL(src.attr(attributeKey), url)); else/*from w w w.j a v a 2s . co m*/ urls.add(new CrawlerURL(src.attr(attributeKey), url)); } return urls; }
From source file:com.jejking.hh.nord.corpus.AllrisHtmlToRawDrucksache.java
/**
 * Extracts the "RTF"-like text fragments that ALLRIS embeds in the overall
 * HTML. JSoup's cleanup removes the stray xml declarations and nested html
 * roots ALLRIS inserts, leaving each fragment's content as siblings of a
 * {@code <meta name="generator">} element.
 *
 * @param htmlDoc the cleaned-up document to scan
 * @return the non-empty text fragments, in document order
 */
private ImmutableList<String> druckSachenContents(Document htmlDoc) {
    // each generator meta element marks the start of one embedded fragment;
    // sometimes there are several candidates
    Elements contentMetaElements = htmlDoc.getElementsByAttributeValue("name", "generator");
    ImmutableList.Builder<String> listBuilder = ImmutableList.builder();
    for (Element contentMetaElement : contentMetaElements) {
        StringBuilder contentAsTextBuilder = new StringBuilder();
        Element nextSibling = contentMetaElement.nextElementSibling();
        // Accumulate sibling text until the next meta marker (or no sibling left).
        // BUG FIX: the original wrote !nextSibling.tag().equals("meta"), comparing
        // a Tag object to a String — always true — so the loop ran past the next
        // meta marker to the end of the sibling list. Compare tag names instead.
        while (nextSibling != null && !nextSibling.tag().getName().equals("meta")) {
            contentAsTextBuilder.append(nextSibling.text());
            nextSibling = nextSibling.nextElementSibling();
        }
        // only carry over non-empty content
        String contentAsText = contentAsTextBuilder.toString();
        if (!removeNonBreakingSpacesAndTrim(contentAsText).isEmpty()) {
            listBuilder.add(contentAsText);
        }
    }
    return listBuilder.build();
}
From source file:edu.ucla.cs.scai.swim.qa.ontology.dbpedia.DBpediaOntologyOld.java
private void traverseHierarchy(Element e, DBpediaCategory category, HashMap<String, DBpediaCategory> map) { for (Element c : e.children()) { String tagName = c.tag().getName(); if (tagName.equals("a")) { String href = c.attr("href"); if (href != null && href.length() > 0) { category.setLabel(c.text()); category.setUri(CLASSES_BASE_URI + c.text()); map.put(category.getLabel(), category); System.out.println(c.text() + "\t" + CLASSES_BASE_URI + c.text()); }//from www . j av a 2s .c o m } else if (tagName.equals("ul")) { for (Element c1 : c.children()) { if (c1.tagName().equals("li")) { DBpediaCategory cc = new DBpediaCategory(); traverseHierarchy(c1, cc, map); cc.parents = new HashSet<>(); cc.parents.add(category); category.getSubClasses().add(cc); } } } } }
From source file:de.geeksfactory.opacclient.apis.Open.java
/**
 * Better version of JSoup's implementation of this function ({@link
 * org.jsoup.nodes.FormElement#formData()}).
 *
 * @param form       The form to submit
 * @param submitName The name attribute of the button which is clicked to submit the form, or
 *                   null
 * @return A MultipartEntityBuilder containing the data of the form
 */
protected MultipartEntityBuilder formData(FormElement form, String submitName) {
    MultipartEntityBuilder data = MultipartEntityBuilder.create();
    data.setLaxMode();
    // iterate the form control elements and accumulate their values
    for (Element el : form.elements()) {
        if (!el.tag().isFormSubmittable()) {
            continue; // contents are form listable, superset of submitable
        }
        String name = el.attr("name");
        // controls without a name attribute are never submitted
        if (name.length() == 0)
            continue;
        String type = el.attr("type");
        if ("select".equals(el.tagName())) {
            // submit every explicitly selected option (multi-selects may have several)
            Elements options = el.select("option[selected]");
            boolean set = false;
            for (Element option : options) {
                data.addTextBody(name, option.val());
                set = true;
            }
            // no option selected: browsers submit the first option by default
            if (!set) {
                Element option = el.select("option").first();
                if (option != null) {
                    data.addTextBody(name, option.val());
                }
            }
        } else if ("checkbox".equalsIgnoreCase(type) || "radio".equalsIgnoreCase(type)) {
            // only add checkbox or radio if they have the checked attribute;
            // an unvalued checked control submits the conventional "on"
            if (el.hasAttr("checked")) {
                data.addTextBody(name, el.val().length() > 0 ? el.val() : "on");
            }
        } else if ("submit".equalsIgnoreCase(type) || "image".equalsIgnoreCase(type)) {
            // only the clicked submit button is included; matched by substring
            // (contains, not equals) — presumably to cope with ASP.NET-style
            // prefixed control names. NOTE(review): confirm contains() is intended.
            if (submitName != null && el.attr("name").contains(submitName)) {
                data.addTextBody(name, el.val());
            }
        } else {
            // text inputs, hidden fields, textareas, etc.: submit current value
            data.addTextBody(name, el.val());
        }
    }
    return data;
}
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/**
 * Extract text and hyperlinks from an element
 * supposingly containing only text.
 *
 * @param textElement
 *      The element to be processed.
 * @param rawStr
 *      The StringBuffer to contain the raw text.
 * @param linkedStr
 *      The StringBuffer to contain the text with hyperlinks.
 */
private void processTextElement(Element textElement, StringBuilder rawStr, StringBuilder linkedStr) {
    // we process each element contained in the specified text element
    for (Node node : textElement.childNodes()) {
        // element node: dispatch on tag name
        if (node instanceof Element) {
            Element element = (Element) node;
            String eltName = element.tag().getName();
            // section headers: same thing
            if (eltName.equals(XmlNames.ELT_H2) || eltName.equals(XmlNames.ELT_H3)
                    || eltName.equals(XmlNames.ELT_H4) || eltName.equals(XmlNames.ELT_H5)
                    || eltName.equals(XmlNames.ELT_H6)) {
                processParagraphElement(element, rawStr, linkedStr);
            }
            // paragraphs inside paragraphs are processed recursively
            else if (eltName.equals(XmlNames.ELT_P)) {
                processParagraphElement(element, rawStr, linkedStr);
            }
            // superscripts are to be avoided: they are either external
            // references or WP inline notes
            // cf. http://en.wikipedia.org/wiki/Template%3ACitation_needed
            else if (eltName.equals(XmlNames.ELT_SUP)) {
                // deliberately skipped
            }
            // small caps are placed before phonetic transcriptions of names;
            // we don't need them, and they can mess up NER tools
            else if (eltName.equals(XmlNames.ELT_SMALL)) {
                // deliberately skipped
            }
            // we ignore certain types of span (phonetic trancription, WP buttons...)
            else if (eltName.equals(XmlNames.ELT_SPAN)) {
                processSpanElement(element, rawStr, linkedStr);
            }
            // hyperlinks must be included in the linked string, provided they are not external
            else if (eltName.equals(XmlNames.ELT_A)) {
                processHyperlinkElement(element, rawStr, linkedStr);
            }
            // lists
            else if (eltName.equals(XmlNames.ELT_UL)) {
                processListElement(element, rawStr, linkedStr, false);
            } else if (eltName.equals(XmlNames.ELT_OL)) {
                processListElement(element, rawStr, linkedStr, true);
            } else if (eltName.equals(XmlNames.ELT_DL)) {
                processDescriptionListElement(element, rawStr, linkedStr);
            }
            // list item: treated as plain text content, recursively
            else if (eltName.equals(XmlNames.ELT_LI)) {
                processTextElement(element, rawStr, linkedStr);
            }
            // divisions are just processed recursively
            else if (eltName.equals(XmlNames.ELT_DIV)) {
                processDivisionElement(element, rawStr, linkedStr);
            }
            // quotes are just processed recursively
            else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) {
                processQuoteElement(element, rawStr, linkedStr);
            }
            // citation
            else if (eltName.equals(XmlNames.ELT_CITE)) {
                processParagraphElement(element, rawStr, linkedStr);
            }
            // other elements are considered as simple text
            else {
                String text = element.text();
                rawStr.append(text);
                linkedStr.append(text);
            }
        }
        // text node
        else if (node instanceof TextNode) {
            // get the text
            TextNode textNode = (TextNode) node;
            String text = textNode.text();
            // if at the begining of a new line, or already preceeded by a space, remove leading spaces
            while (rawStr.length() > 0
                    && (rawStr.charAt(rawStr.length() - 1) == '\n' || rawStr.charAt(rawStr.length() - 1) == ' ')
                    && text.startsWith(" "))
                text = text.substring(1);
            // complete string buffers
            rawStr.append(text);
            linkedStr.append(text);
        }
    }
}
From source file:tr.edu.gsu.nerwip.retrieval.reader.wikipedia.WikipediaReader.java
/**
 * Pulls a text from a Wikipedia URL without images, tags, etc.
 *
 * @param url
 *      Address of the targetted text.
 * @return
 *      An Article object representing the retrieved object.
 *
 * @throws ReaderException
 *      Problem while retrieving the text.
 */
@Override
public Article read(URL url) throws ReaderException {
    Article result = null;
    String name = getName(url);
    try {
        // get the page
        String address = url.toString();
        logger.log("Retrieving page " + address);
        long startTime = System.currentTimeMillis();
        Document document = retrieveSourceCode(name, url);

        // get its title (first element carrying the title id)
        Element firstHeadingElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_TITLE).get(0);
        String title = firstHeadingElt.text();
        logger.log("Get title: " + title);

        // get raw and linked texts
        logger.log("Get raw and linked texts.");
        StringBuilder rawStr = new StringBuilder();
        StringBuilder linkedStr = new StringBuilder();
        Element bodyContentElt = document.getElementsByAttributeValue(XmlNames.ATT_ID, ID_CONTENT).get(0);

        // processing each element in the content part;
        // ignoringSection: inside a section listed in IGNORED_SECTIONS
        // first: still before the first real content element (used to skip
        //        the initial disambiguation link / thumbnail picture)
        boolean ignoringSection = false;
        boolean first = true;
        for (Element element : bodyContentElt.children()) {
            String eltName = element.tag().getName();
            String eltClass = element.attr(XmlNames.ATT_CLASS);

            // section headers
            if (eltName.equals(XmlNames.ELT_H2)) {
                first = false;
                // get section name (rendered into throwaway buffers)
                StringBuilder fakeRaw = new StringBuilder();
                StringBuilder fakeLinked = new StringBuilder();
                processParagraphElement(element, fakeRaw, fakeLinked);
                String str = fakeRaw.toString().trim().toLowerCase(Locale.ENGLISH);
                // check section name against the ignore list
                if (IGNORED_SECTIONS.contains(str))
                    ignoringSection = true;
                else {
                    ignoringSection = false;
                    rawStr.append("\n-----");
                    linkedStr.append("\n-----");
                    processParagraphElement(element, rawStr, linkedStr);
                }
            } else if (!ignoringSection) {
                // lower sections
                if (eltName.equals(XmlNames.ELT_H3) || eltName.equals(XmlNames.ELT_H4)
                        || eltName.equals(XmlNames.ELT_H5) || eltName.equals(XmlNames.ELT_H6)) {
                    first = false;
                    processParagraphElement(element, rawStr, linkedStr);
                }
                // paragraph
                else if (eltName.equals(XmlNames.ELT_P)) {
                    String str = element.text();
                    // ignore possible initial disambiguation link
                    if (!first || !str.startsWith(PARAGRAPH_FORTHE)) {
                        first = false;
                        processParagraphElement(element, rawStr, linkedStr);
                    }
                }
                // list
                else if (eltName.equals(XmlNames.ELT_UL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, false);
                } else if (eltName.equals(XmlNames.ELT_OL)) {
                    first = false;
                    processListElement(element, rawStr, linkedStr, true);
                } else if (eltName.equals(XmlNames.ELT_DL)) {
                    first = false;
                    processDescriptionListElement(element, rawStr, linkedStr);
                }
                // tables; the process* helpers return whether real content was
                // produced, hence first = !processed
                else if (eltName.equals(XmlNames.ELT_TABLE)) {
                    first = !processTableElement(element, rawStr, linkedStr);
                }
                // divisions
                else if (eltName.equals(XmlNames.ELT_DIV)) {
                    // ignore possible initial picture
                    if (!first || eltClass == null || !eltClass.contains(CLASS_THUMB))
                        first = !processDivisionElement(element, rawStr, linkedStr);
                }
                // we ignore certain types of span (phonetic trancription, WP buttons...)
                else if (eltName.equals(XmlNames.ELT_SPAN)) {
                    first = !processSpanElement(element, rawStr, linkedStr);
                }
                // hyperlinks must be included in the linked string, provided they are not external
                else if (eltName.equals(XmlNames.ELT_A)) {
                    first = !processHyperlinkElement(element, rawStr, linkedStr);
                }
                // quotes are just processed recursively
                else if (eltName.equals(XmlNames.ELT_BLOCKQUOTE)) {
                    first = !processQuoteElement(element, rawStr, linkedStr);
                }
                // other tags are ignored
            }
        }

        // create article object
        result = new Article(name);
        result.setTitle(title);
        result.setUrl(url);
        result.initDate();

        // clean text
        String rawText = rawStr.toString();
        rawText = cleanText(rawText);
        result.setRawText(rawText);
        logger.log("Length of the raw text: " + rawText.length() + " chars.");
        String linkedText = linkedStr.toString();
        linkedText = cleanText(linkedText);
        result.setLinkedText(linkedText);
        logger.log("Length of the linked text: " + linkedText.length() + " chars.");

        // get original html source code
        logger.log("Get original HTML source code.");
        String originalPage = document.toString();
        result.setOriginalPage(originalPage);
        logger.log("Length of the original page: " + originalPage.length() + " chars.");

        // get the categories of the article
        List<ArticleCategory> categories = getArticleCategories(result);
        result.setCategories(categories);

        long endTime = System.currentTimeMillis();
        logger.log("Total duration: " + (endTime - startTime) + " ms.");
    }
    // NOTE(review): failures are printed and swallowed, returning a possibly
    // null result instead of the declared ReaderException — confirm intended
    catch (ClientProtocolException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (org.json.simple.parser.ParseException e) {
        e.printStackTrace();
    }
    return result;
}