List of usage examples for the org.jsoup.nodes.TextNode constructor
public TextNode(String text, String baseUri)
From source file:isc_415_practica_1.ISC_415_Practica_1.java
/** * @param args the command line arguments *///from w w w .j a v a2 s . co m public static void main(String[] args) { String urlString; Scanner input = new Scanner(System.in); Document doc; try { urlString = input.next(); if (urlString.equals("servlet")) { urlString = "http://localhost:8084/ISC_415_Practica1_Servlet/client"; } urlString = urlString.contains("http://") || urlString.contains("https://") ? urlString : "http://" + urlString; doc = Jsoup.connect(urlString).get(); } catch (Exception ex) { System.out.println("El URL ingresado no es valido."); return; } ArrayList<NameValuePair> formInputParams; formInputParams = new ArrayList<>(); String[] plainTextDoc = new TextNode(doc.html(), "").getWholeText().split("\n"); System.out.println(String.format("Nmero de lineas del documento: %d", plainTextDoc.length)); System.out.println(String.format("Nmero de p tags: %d", doc.select("p").size())); System.out.println(String.format("Nmero de img tags: %d", doc.select("img").size())); System.out.println(String.format("Nmero de form tags: %d", doc.select("form").size())); Integer index = 1; ArrayList<NameValuePair> urlParameters = new ArrayList<>(); for (Element e : doc.select("form")) { System.out.println(String.format("Form %d: Nmero de Input tags %d", index, e.select("input").size())); System.out.println(e.select("input")); for (Element formInput : e.select("input")) { if (formInput.attr("id") != null && formInput.attr("id") != "") { urlParameters.add(new BasicNameValuePair(formInput.attr("id"), "PRACTICA1")); } else if (formInput.attr("name") != null && formInput.attr("name") != "") { urlParameters.add(new BasicNameValuePair(formInput.attr("name"), "PRACTICA1")); } } index++; } if (!urlParameters.isEmpty()) { try { CloseableHttpClient httpclient = HttpClients.createDefault(); UrlEncodedFormEntity entity = new UrlEncodedFormEntity(urlParameters, Consts.UTF_8); HttpPost httpPost = new HttpPost(urlString); httpPost.setHeader("User-Agent", USER_AGENT); 
httpPost.setEntity(entity); HttpResponse response = httpclient.execute(httpPost); System.out.println(response.getStatusLine()); } catch (IOException ex) { Logger.getLogger(ISC_415_Practica_1.class.getName()).log(Level.SEVERE, null, ex); } } }
From source file:com.cognifide.aet.job.common.comparators.source.visitors.MarkupVisitor.java
@Override public void visit(Node node) { if (node instanceof TextNode || node instanceof Comment || node instanceof DataNode) { node.replaceWith(new TextNode(StringUtils.EMPTY, node.baseUri())); }// www . ja va 2 s . co m }
From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java
/** * cleans up and converts any nodes that should be considered text into text *///from w w w . j a va2s . co m private void convertLinksToText() { if (logger.isDebugEnabled()) { logger.debug("Turning links to text"); } Elements links = topNode.getElementsByTag("a"); for (Element item : links) { if (item.getElementsByTag("img").size() == 0) { TextNode tn = new TextNode(item.text(), topNode.baseUri()); item.replaceWith(tn); } } }
From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java
/** * replace common tags with just text so we don't have any crazy formatting issues * so replace <br>, <i>, <strong>, etc.... with whatever text is inside them *///from w w w .j a v a2s .com private void replaceTagsWithText() { Elements strongs = topNode.getElementsByTag("strong"); for (Element item : strongs) { TextNode tn = new TextNode(item.text(), topNode.baseUri()); item.replaceWith(tn); } Elements bolds = topNode.getElementsByTag("b"); for (Element item : bolds) { TextNode tn = new TextNode(item.text(), topNode.baseUri()); item.replaceWith(tn); } Elements italics = topNode.getElementsByTag("i"); for (Element item : italics) { TextNode tn = new TextNode(item.text(), topNode.baseUri()); item.replaceWith(tn); } }
From source file:me.rkfg.xmpp.bot.plugins.CoolStoryPlugin.java
private String fetchStory(Website website) throws IOException { int roll = 0; String result;// w ww. ja v a 2s .c o m int resultLength; int resultLines; //noinspection ConstantConditions do { roll++; final Document doc = Jsoup.connect(website.getUrlString()).userAgent(DEFAULT_UA).get(); doc.outputSettings(new Document.OutputSettings().prettyPrint(false)); logger.info("Fetched a story from {}", doc.location()); final Element story = doc.select(website.getCssQuery()).first(); if (story == null) { return ERROR_COULD_NOT_PARSE; } story.select("div").remove(); story.select("img").forEach(img -> img.replaceWith(new TextNode(img.attr("src"), ""))); story.select("br").after("\\n"); story.select("p").before("\\n\\n"); final String storyHtml = story.html().replaceAll("\\\\n", "\n"); result = Jsoup.clean(storyHtml, "", Whitelist.none(), new Document.OutputSettings().prettyPrint(false)) .trim(); resultLength = result.length(); resultLines = countLines(result); } while (CONFIG_REROLL_LONG_STORIES && (resultLength > CONFIG_MAX_STORY_LENGTH || resultLines > CONFIG_MAX_STORY_LINES) && roll <= CONFIG_MAX_ROLLS); return result; }
From source file:us.colloquy.sandbox.TestExtractor.java
@Test public void useJsoup() { String homeDir = System.getProperty("user.home"); System.out.println(homeDir);//ww w . j av a 2 s .co m //JSOUP API allows to extract all elements of letters in files // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml"); File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html"); try { Document doc = Jsoup.parse(input, "UTF-8"); List<Letter> letters = new ArrayList<>(); //our model contains only a subset of fields String previousYear = ""; for (Element element : doc.getElementsByClass("section")) { Letter letter = new Letter(); StringBuilder content = new StringBuilder(); for (Element child : element.children()) { for (Attribute att : child.attributes()) { System.out.println(att.getKey() + " " + att.getValue()); } if ("center".equalsIgnoreCase(child.className())) { String toWhom = child.getElementsByTag("strong").text(); if (StringUtils.isEmpty(toWhom)) { toWhom = child.text(); // System.out.println(toWhom); } String[] toWhomArray = toWhom.split("(\\s\\s)|(,)"); for (String to : toWhomArray) { RussianDate.parseToWhom(letter, to); //here we need to recognize a russian name and store that but for now we store the content } //check if there is anything else here and find date and place - it will be replaced if exists below String entireText = child.text(); String tail = entireText.replace(toWhom, ""); if (StringUtils.isNotEmpty(tail)) { RussianDate.parseDateAndPlace(letter, tail, previousYear); //a parser that figures out date and place if they are present } // System.out.println("two whom\t " + child.getElementsByTag("strong").text() ); } else if ("Data".equalsIgnoreCase(child.className())) { if (child.getElementsByTag("em") != null && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) { RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(), previousYear); //most often date and place are enclosed in em tag if (letter.getDate() != null) { LocalDate localDate = 
letter.getDate().toInstant().atZone(ZoneId.systemDefault()) .toLocalDate(); int year = localDate.getYear(); previousYear = year + ""; } } // System.out.println("when and where\t " + child.getElementsByTag("em").text()); } else if ("petit".equalsIgnoreCase(child.className()) || "Textpetit_otstup".equalsIgnoreCase(child.className())) { letter.getNotes().add(child.text()); } else { //System.out.println(child.text() ); Elements elements = child.getElementsByTag("sup"); for (Element e : elements) { String value = e.text(); e.replaceWith(new TextNode("[" + value + "]", null)); } for (Element el : child.getAllElements()) { // System.out.println(el.tagName()); if ("sup".equalsIgnoreCase(el.tagName())) { content.append(" [" + el.text() + "] "); } else { content.append(el.text()); } } content.append("\n"); } // System.out.println(child.tag() + "\n" ); // System.out.println(child.outerHtml() + "\n" + child.text()); } letter.setContent(content.toString()); letters.add(letter); } ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter(); for (Letter letter : letters) { // if (letter.getDate() == null) // { // if (StringUtils.isNotEmpty(person.getLastName())) // { String json = ow.writeValueAsString(letter); System.out.println(json); // } //} } } catch (IOException e) { e.printStackTrace(); } }
From source file:us.colloquy.util.DiaryParser.java
private static void replaceSupTag(Element child) { Elements elements = child.getElementsByTag("sup"); for (Element e : elements) { String value = e.text(); e.replaceWith(new TextNode("[" + value + "]", null)); }//from ww w . j av a 2 s . co m }