Example usage for org.jsoup.nodes TextNode TextNode

Introduction

On this page you can find example usages of the org.jsoup.nodes.TextNode constructor TextNode(String, String).

Prototype

public TextNode(String text, String baseUri)

Source Link

Document

Create a new TextNode representing the supplied (unencoded) text.

Usage

From source file:isc_415_practica_1.ISC_415_Practica_1.java

/**
 * Reads a URL from standard input, downloads the page with jsoup, prints a few
 * statistics about it (line count, number of p/img/form tags, inputs per form)
 * and, if any form input carries an id or a name, POSTs a dummy value for each
 * of those inputs back to the same URL and prints the response status line.
 *
 * <p>The special input {@code servlet} is a shortcut for the local practice servlet.
 *
 * @param args the command line arguments (unused)
 */
public static void main(String[] args) {
    String urlString;
    // Deliberately not closed: closing the Scanner would also close System.in.
    Scanner input = new Scanner(System.in);
    Document doc;

    try {
        urlString = input.next();
        if (urlString.equals("servlet")) {
            urlString = "http://localhost:8084/ISC_415_Practica1_Servlet/client";
        }
        // Only a *leading* scheme counts. A "contains" check would skip the prefix for
        // inputs such as "example.com/?next=http://other".
        if (!urlString.startsWith("http://") && !urlString.startsWith("https://")) {
            urlString = "http://" + urlString;
        }
        doc = Jsoup.connect(urlString).get();
    } catch (Exception ex) {
        // Covers missing input, malformed URLs and any network/HTTP failure alike.
        System.out.println("El URL ingresado no es valido.");
        return;
    }

    String[] plainTextDoc = new TextNode(doc.html(), "").getWholeText().split("\n");
    System.out.println(String.format("Nmero de lineas del documento: %d", plainTextDoc.length));
    System.out.println(String.format("Nmero de p tags: %d", doc.select("p").size()));
    System.out.println(String.format("Nmero de img tags: %d", doc.select("img").size()));
    System.out.println(String.format("Nmero de form tags: %d", doc.select("form").size()));

    int index = 1;

    ArrayList<NameValuePair> urlParameters = new ArrayList<>();
    for (Element e : doc.select("form")) {
        System.out.println(String.format("Form %d: Nmero de Input tags %d", index, e.select("input").size()));
        System.out.println(e.select("input"));

        for (Element formInput : e.select("input")) {
            // attr() yields an empty string (never null) for a missing attribute, and
            // strings must be compared by content, not with == / !=.
            String id = formInput.attr("id");
            String name = formInput.attr("name");
            if (!id.isEmpty()) {
                urlParameters.add(new BasicNameValuePair(id, "PRACTICA1"));
            } else if (!name.isEmpty()) {
                urlParameters.add(new BasicNameValuePair(name, "PRACTICA1"));
            }
        }

        index++;
    }

    if (!urlParameters.isEmpty()) {
        // try-with-resources releases the client's connections even when execute() fails.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            UrlEncodedFormEntity entity = new UrlEncodedFormEntity(urlParameters, Consts.UTF_8);
            HttpPost httpPost = new HttpPost(urlString);
            httpPost.setHeader("User-Agent", USER_AGENT);
            httpPost.setEntity(entity);
            HttpResponse response = httpclient.execute(httpPost);
            System.out.println(response.getStatusLine());
        } catch (IOException ex) {
            Logger.getLogger(ISC_415_Practica_1.class.getName()).log(Level.SEVERE, null, ex);
        }

    }

}

From source file:com.cognifide.aet.job.common.comparators.source.visitors.MarkupVisitor.java

/**
 * Replaces the visited node with an empty text node when it is a text, comment
 * or data node; every other node type is left untouched.
 *
 * @param node the node currently being visited
 */
@Override
public void visit(Node node) {
    boolean carriesContent = node instanceof TextNode || node instanceof Comment || node instanceof DataNode;
    if (!carriesContent) {
        return;
    }
    // The replacement keeps the base URI of the node it stands in for.
    TextNode blank = new TextNode(StringUtils.EMPTY, node.baseUri());
    node.replaceWith(blank);
}

From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java

/**
 * Replaces every anchor under {@code topNode} that does not contain an image
 * with a plain text node holding the anchor's text.
 */
private void convertLinksToText() {
    if (logger.isDebugEnabled()) {
        logger.debug("Turning links to text");
    }
    for (Element anchor : topNode.getElementsByTag("a")) {
        if (!anchor.getElementsByTag("img").isEmpty()) {
            // Anchors wrapping an image are kept as they are.
            continue;
        }
        anchor.replaceWith(new TextNode(anchor.text(), topNode.baseUri()));
    }
}

From source file:com.jimplush.goose.outputformatters.DefaultOutputFormatter.java

/**
 * Replaces common inline formatting tags with just their text so that no stray
 * formatting survives. The tags handled are {@code <strong>}, {@code <b>} and
 * {@code <i>}, processed in that order; each matching element under
 * {@code topNode} is swapped for a text node containing the element's text.
 */
private void replaceTagsWithText() {
    final String[] inlineTags = { "strong", "b", "i" };

    for (String tagName : inlineTags) {
        // Elements are looked up per tag, after the previous tag has been fully replaced.
        for (Element styled : topNode.getElementsByTag(tagName)) {
            styled.replaceWith(new TextNode(styled.text(), topNode.baseUri()));
        }
    }
}

From source file:me.rkfg.xmpp.bot.plugins.CoolStoryPlugin.java

/**
 * Downloads a story from the given website and returns it as plain text.
 *
 * <p>Each attempt fetches the page, picks the first element matching the site's
 * CSS query, drops nested {@code div}s, substitutes images with their
 * {@code src} value, turns {@code br}/{@code p} boundaries into line breaks and
 * strips all remaining markup. When re-rolling is enabled and the result exceeds
 * the configured length or line limits, another attempt is made until the
 * configured number of rolls is exhausted.
 *
 * @param website the site to fetch from; supplies the URL and the CSS query
 * @return the trimmed story text, or {@code ERROR_COULD_NOT_PARSE} if the CSS
 *         query matched nothing
 * @throws IOException if the page could not be fetched
 */
private String fetchStory(Website website) throws IOException {
    int attempt = 0;
    String text;
    boolean rerollNeeded;

    //noinspection ConstantConditions
    do {
        attempt++;

        final Document page = Jsoup.connect(website.getUrlString()).userAgent(DEFAULT_UA).get();
        page.outputSettings(new Document.OutputSettings().prettyPrint(false));
        logger.info("Fetched a story from {}", page.location());

        final Element container = page.select(website.getCssQuery()).first();
        if (container == null) {
            return ERROR_COULD_NOT_PARSE;
        }

        container.select("div").remove();
        for (Element image : container.select("img")) {
            image.replaceWith(new TextNode(image.attr("src"), ""));
        }
        // A literal backslash-n marker is planted in the markup here and turned into a
        // real line break once the HTML has been serialised.
        container.select("br").after("\\n");
        container.select("p").before("\\n\\n");
        final String markup = container.html().replaceAll("\\\\n", "\n");

        final Document.OutputSettings compact = new Document.OutputSettings().prettyPrint(false);
        text = Jsoup.clean(markup, "", Whitelist.none(), compact).trim();

        final int textLength = text.length();
        final int lineCount = countLines(text);
        final boolean tooLong = textLength > CONFIG_MAX_STORY_LENGTH || lineCount > CONFIG_MAX_STORY_LINES;
        rerollNeeded = CONFIG_REROLL_LONG_STORIES && tooLong && attempt <= CONFIG_MAX_ROLLS;
    } while (rerollNeeded);

    return text;
}

From source file:us.colloquy.sandbox.TestExtractor.java

/**
 * Exploratory test: parses a sample HTML file with jsoup, builds one {@code Letter}
 * per element of class "section" and prints each letter as pretty-printed JSON.
 *
 * <p>Each direct child of a section is classified by its CSS class name:
 * <ul>
 *   <li>"center" - addressee line, optionally followed by date/place text;</li>
 *   <li>"Data" - date and place, taken from the {@code em} tag text;</li>
 *   <li>"petit" / "Textpetit_otstup" - added to the letter's notes;</li>
 *   <li>anything else - appended to the letter content.</li>
 * </ul>
 *
 * <p>The method makes no assertions; an IOException is only printed.
 */
@Test
public void useJsoup() {

    String homeDir = System.getProperty("user.home");

    System.out.println(homeDir);

    //JSOUP API allows to extract all  elements of letters in files

    // File input = new File("samples/OEBPS/Text/0001_1006_2001.xhtml");

    // Relative path: resolved against the working directory of the test run.
    File input = new File("samples/pisma-1904/OEBPS/Text/single_doc.html");

    try {
        Document doc = Jsoup.parse(input, "UTF-8");

        List<Letter> letters = new ArrayList<>(); //our model contains only a subset of fields

        // Year of the most recently parsed dated letter; passed to the date parser
        // for subsequent letters (presumably as a fallback year - confirm in RussianDate).
        String previousYear = "";

        for (Element element : doc.getElementsByClass("section")) {
            Letter letter = new Letter();

            StringBuilder content = new StringBuilder();

            for (Element child : element.children()) {

                // Debug output: dump every attribute of the child element.
                for (Attribute att : child.attributes()) {
                    System.out.println(att.getKey() + " " + att.getValue());
                }

                if ("center".equalsIgnoreCase(child.className())) {
                    // Addressee: prefer the text of <strong> tags, fall back to the whole text.
                    String toWhom = child.getElementsByTag("strong").text();

                    if (StringUtils.isEmpty(toWhom)) {
                        toWhom = child.text();
                        // System.out.println(toWhom);
                    }

                    // Split on two consecutive whitespace characters or on a comma.
                    String[] toWhomArray = toWhom.split("(\\s\\s)|(,)");

                    for (String to : toWhomArray) {
                        RussianDate.parseToWhom(letter, to); //here we need to recognize a russian name and store that but for now we store the content
                    }

                    //check if there is anything else here and find date and place - it will be replaced if exists below

                    String entireText = child.text();

                    // Whatever remains after removing the addressee text is treated as date/place.
                    String tail = entireText.replace(toWhom, "");

                    if (StringUtils.isNotEmpty(tail)) {
                        RussianDate.parseDateAndPlace(letter, tail, previousYear); //a parser that figures out date and place if they are present
                    }

                    // System.out.println("two whom\t " +  child.getElementsByTag("strong").text() );

                } else if ("Data".equalsIgnoreCase(child.className())) {

                    // NOTE(review): jsoup's getElementsByTag is documented to return a
                    // (possibly empty) collection, so the null check looks redundant - confirm.
                    if (child.getElementsByTag("em") != null
                            && StringUtils.isNotEmpty(child.getElementsByTag("em").text())) {
                        RussianDate.parseDateAndPlace(letter, child.getElementsByTag("em").text(),
                                previousYear); //most often date and place are enclosed in em tag

                        if (letter.getDate() != null) {
                            // Remember the year (in the system default time zone) for later letters.
                            LocalDate localDate = letter.getDate().toInstant().atZone(ZoneId.systemDefault())
                                    .toLocalDate();
                            int year = localDate.getYear();
                            previousYear = year + "";
                        }
                    }

                    // System.out.println("when and where\t " + child.getElementsByTag("em").text());

                } else if ("petit".equalsIgnoreCase(child.className())
                        || "Textpetit_otstup".equalsIgnoreCase(child.className())) {
                    letter.getNotes().add(child.text());

                } else {
                    //System.out.println(child.text() );

                    // Turn every <sup> element into a bracketed text node, e.g. "[1]".
                    Elements elements = child.getElementsByTag("sup");

                    for (Element e : elements) {
                        String value = e.text();

                        e.replaceWith(new TextNode("[" + value + "]", null));
                    }

                    // NOTE(review): getAllElements() is documented to include the element
                    // itself plus all descendants, and text() to include descendant text, so
                    // text of nested elements may be appended more than once - confirm intent.
                    for (Element el : child.getAllElements()) {
                        // System.out.println(el.tagName());
                        if ("sup".equalsIgnoreCase(el.tagName())) {
                            content.append(" [" + el.text() + "] ");
                        } else {
                            content.append(el.text());
                        }

                    }

                    content.append("\n");

                }

                //                  System.out.println(child.tag() + "\n" );
                //                  System.out.println(child.outerHtml() + "\n" + child.text());
            }

            letter.setContent(content.toString());
            letters.add(letter);
        }

        ObjectWriter ow = new com.fasterxml.jackson.databind.ObjectMapper().writer().withDefaultPrettyPrinter();

        // Print every collected letter as pretty-printed JSON.
        for (Letter letter : letters) {
            //                if (letter.getDate() == null)
            //                {

            //                        if (StringUtils.isNotEmpty(person.getLastName()))
            //                        {
            String json = ow.writeValueAsString(letter);

            System.out.println(json);
            //                        }

            //}

        }

    } catch (IOException e) {
        // Failures are only printed; the test does not fail on an I/O error.
        e.printStackTrace();
    }

}

From source file:us.colloquy.util.DiaryParser.java

/**
 * Replaces every {@code <sup>} element found via {@code child.getElementsByTag("sup")}
 * with a text node containing the element's text wrapped in square brackets.
 *
 * @param child the element whose {@code <sup>} tags are replaced
 */
private static void replaceSupTag(Element child) {
    for (Element superscript : child.getElementsByTag("sup")) {
        String bracketed = "[" + superscript.text() + "]";
        superscript.replaceWith(new TextNode(bracketed, null));
    }
}