Usage examples for org.jsoup.nodes.Document#title()
Method signature: public String title()
From source file:io.github.carlomicieli.footballdb.starter.parsers.SeasonGamesParser.java
/**
 * Extracts the year prefix from the page title.
 *
 * @param doc the parsed page; its title is assumed to begin with a
 *            four-digit year — TODO confirm against the source site
 * @return the first four characters of the document title
 */
protected static String extractYear(Document doc) {
    final String title = doc.title();
    return title.substring(0, 4);
}
From source file:io.github.carlomicieli.footballdb.starter.documents.WebDocumentDownloader.java
private static Document downloadFromURL(String url) { validateUrl(url);// w w w . j a v a 2 s .co m try { Document doc = Jsoup.connect(url).userAgent(CHROME_USER_AGENT).get(); App.log().info("Downloading '{}'...", doc.title()); return doc; } catch (IOException e) { App.log().error("Error for '{}': {}", url, e); return null; } }
From source file:FILER.java
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file { Text = ""; String[] Importants = { "", "", "", "" }; //first element is the title,second is all headers,third is img alt,4th is the url org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8"); Importants[0] = doc.title(); //get the title of the file //Text=Text+" "+doc.title(); String tag = "h"; String All_Headers = ""; Elements Header;/*from w w w . j a v a 2s . com*/ for (int i = 1; i < 20; i++) //loop to get text with headers tag of the file { tag = "h" + String.valueOf(i); Header = doc.select(tag); if (Header.size() > 0) { Header = doc.getElementsByTag(tag); String pConcatenated = ""; for (Element x : Header) { pConcatenated += x.text() + " "; } All_Headers = All_Headers + pConcatenated; } else break; } Importants[1] = All_Headers; Text = Text + " " + doc.text(); //get the text of the document Elements img = doc.getElementsByTag("img"); //get the text with img tag for (Element element : img) { if (element.attr("alt") != null && !(element.attr("alt").equals(""))) { Text = Text + " " + element.attr("alt"); Importants[2] = Importants[2] + " " + element.attr("alt"); } } return Importants; }
From source file:com.zack6849.alphabot.api.Utils.java
public static String getTitle(String link) { String response = ""; try {//from ww w .j av a 2 s .co m HttpURLConnection conn = (HttpURLConnection) new URL(link).openConnection(); conn.addRequestProperty("User-Agent", USER_AGENT); String type = conn.getContentType(); int length = conn.getContentLength() / 1024; response = String.format("HTTP %s: %s", conn.getResponseCode(), conn.getResponseMessage()); String info; if (type.contains("text") || type.contains("application")) { Document doc = Jsoup.connect(link).userAgent(USER_AGENT).followRedirects(true).get(); String title = doc.title() == null || doc.title().isEmpty() ? "No title found!" : doc.title(); info = String.format("%s - (Content Type: %s Size: %skb)", title, type, length); return info; } info = String.format("Content Type: %s Size: %skb", type, length); return info; } catch (IOException ex) { if (ex.getMessage().contains("UnknownHostException")) { return Colors.RED + "Unknown hostname!"; } return response.isEmpty() ? Colors.RED + "An error occured" : response; } }
From source file:index.IndexManager.java
public static Triple<SolrInputDocument, Collection<String>, Collection<String>> index(Document document) { final SolrInputDocument index = new SolrInputDocument(); index.setField("id", document.location()); index.setField("time", String.valueOf(System.currentTimeMillis())); index.setField("title", document.title()); final Set<String> links = document.select("a[href]").stream().map(e -> e.attr("abs:href")) .collect(Collectors.toSet()); final Set<String> media = document.select("[src]").stream().map(e -> e.attr("abs:src")) .collect(Collectors.toSet()); links.forEach(link -> index.addField("link", link)); media.forEach(link -> index.addField("media", link)); formatText(document.getElementsByTag("h1").stream()).forEach(e -> index.addField("h1", e)); formatText(document.getElementsByTag("h2").stream()).forEach(e -> index.addField("h2", e)); formatText(document.getElementsByTag("h3").stream()).forEach(e -> index.addField("h3", e)); formatText(document.getElementsByTag("strong").stream()).forEach(e -> index.addField("strong", e)); formatText(document.getElementsByTag("em").stream()).forEach(e -> index.addField("em", e)); formatText(document.getElementsByTag("b").stream()).forEach(e -> index.addField("b", e)); formatText(document.getElementsByTag("u").stream()).forEach(e -> index.addField("u", e)); formatText(document.getElementsByTag("i").stream()).forEach(e -> index.addField("i", e)); int i = 0;//from w ww . ja va 2 s .c o m Collection<String> text = chunkToLength(document.text()); for (String chunk : text) index.addField(++i + "_text", chunk); return Triple.of(index, links, media); }
From source file:com.nuance.expertassistant.ContentExtractor.java
/**
 * Walks the document and emits a nested {@code <section>}/{@code <para>}
 * outline via {@code print}, opening a section for each h1–h4 heading and
 * closing open sections whenever a heading of the same or a shallower level
 * appears.
 *
 * <p>Fix over the original: the whitespace-only check used
 * {@code ownText().trim() == ""}, which compares object references and is
 * always false for a freshly trimmed string; whitespace-only elements are
 * now correctly skipped. The four copy-pasted close/open sequences are
 * factored into helpers with identical ordering.
 *
 * @param doc the parsed document to convert
 */
public static void extract(Document doc) {
    print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(doc.title()) + "\">");
    final Elements elements = doc.select("*");
    final ArrayList<String> openHeaderList = new ArrayList<String>();
    for (final Element element : elements) {
        final String ownText = element.ownText();
        if (ownText == null || ownText.isEmpty() || ownText.trim().isEmpty()) {
            // Nothing to emit for this element.
        } else if (element.tagName().contains("a")) {
            // Anchors are skipped. NOTE(review): contains("a") also matches
            // tags like "table" and "span"; preserved from the original, but
            // equals("a") was probably intended — confirm before changing.
        } else if (isNonEmptyHeading(element, "h1")) {
            closeOpenSections(openHeaderList, 1);
            openHeadingSection(element, openHeaderList, "h1");
        } else if (isNonEmptyHeading(element, "h2")) {
            closeOpenSections(openHeaderList, 2);
            openHeadingSection(element, openHeaderList, "h2");
        } else if (isNonEmptyHeading(element, "h3")) {
            closeOpenSections(openHeaderList, 3);
            openHeadingSection(element, openHeaderList, "h3");
        } else if (isNonEmptyHeading(element, "h4")) {
            closeOpenSections(openHeaderList, 4);
            openHeadingSection(element, openHeaderList, "h4");
        } else {
            print("<para>");
            print(stripNonValidXMLCharacters(element.ownText()));
            print("</para>");
        }
    }
    // Close whatever heading sections remain open, then the root section.
    closeOpenSections(openHeaderList, 1);
    print("</section>");
}

/** True when the element's tag name contains {@code tag} and it has visible text. */
private static boolean isNonEmptyHeading(Element element, String tag) {
    return element.tagName().contains(tag) && element.text() != null && !element.text().isEmpty();
}

/** Emits closing tags for every tracked section at {@code fromLevel} or deeper, shallowest first. */
private static void closeOpenSections(ArrayList<String> openHeaders, int fromLevel) {
    for (int level = fromLevel; level <= 4; level++) {
        String tag = "h" + level;
        if (openHeaders.contains(tag)) {
            openHeaders.remove(tag);
            print("</section>");
        }
    }
}

/** Opens a new section titled with the heading's text and records its level as open. */
private static void openHeadingSection(Element element, ArrayList<String> openHeaders, String tag) {
    print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">");
    openHeaders.add(tag);
}
From source file:io.github.carlomicieli.footballdb.starter.parsers.DraftParser.java
/**
 * Parses the leading four characters of the document title as a {@link Year}.
 *
 * @param doc the page whose title is assumed to start with a four-digit
 *            year — TODO confirm against the source site
 * @return the parsed year
 */
protected Year extractYear(Document doc) {
    return Year.parse(doc.title().substring(0, 4));
}
From source file:co.foxdev.foxbot.utils.Utils.java
/**
 * Builds a one-line IRC summary for a URL pasted in chat: an HTTP status line
 * on error responses, content type and size for non-HTML content, and the
 * page title (with extra scraped details for YouTube videos and Reddit
 * threads) for HTML pages.
 *
 * @param stringToParse the URL to fetch and describe
 * @param sender the user who pasted the link; their munged nick prefixes the reply
 * @return a colourised summary line, or {@code null} on a malformed URL or any error
 */
public static String parseChatUrl(String stringToParse, User sender) {
    try {
        Connection conn = Jsoup.connect(stringToParse);
        // Identify the bot, cap the body size, and accept any content type so
        // non-HTML resources can still be summarised by type/size.
        conn.followRedirects(true).userAgent(
                "FoxBot // http://foxbot.foxdev.co // Seeing this? It means your web address was posted on IRC and FoxBot is getting page info (title, size, content type) to send to the channel. Nothing to worry about.")
                .timeout(3000).maxBodySize(100000).ignoreContentType(true);
        Connection.Response response = conn.execute();
        Document doc = response.parse();
        // Content-Length may be absent; report "Unknown" rather than failing.
        String size = response.header("Content-Length") == null ? "Unknown"
                : (Integer.parseInt(response.header("Content-Length")) / 1024) + "kb";
        // Strip any "; charset=..." suffix from the content type.
        String contentType = response.contentType().contains(";") ? response.contentType().split(";")[0]
                : response.contentType();
        // NOTE(review): contentType() and the .first() calls below can return
        // null and NPE; the broad catch (Exception) at the bottom converts
        // that into a null return — confirm this is acceptable behaviour.
        if (response.statusCode() != 200 && response.statusCode() != 302 && response.statusCode() != 301) {
            return colourise(String.format("(%s's URL) &cError: &r%s %s ", munge(sender.getNick()),
                    response.statusCode(), response.statusMessage()));
        }
        if (!contentType.contains("html")) {
            return colourise(String.format("(%s's URL) &2Content Type: &r%s &2Size: &r%s",
                    munge(sender.getNick()), contentType, size));
        }
        String title = doc.title() == null || doc.title().isEmpty() ? "No title found" : doc.title();
        if (stringToParse.matches("^https?://(www\\.)?youtube\\.com/watch.*")) {
            // YouTube watch pages: scrape title, uploader, views and rating
            // from the page markup.
            title = doc.select("span#eow-title").first().text();
            String views = doc.select("span.watch-view-count").first().text();
            String likes = doc.select("span.likes-count").first().text();
            String dislikes = doc.select("span.dislikes-count").first().text();
            String uploader = doc.select("a.g-hovercard.yt-uix-sessionlink.yt-user-name").first().text();
            return colourise(String.format(
                    "(%s's URL) &2Title: &r%s &2Uploader: &r%s &2Views: &r%s &2Rating: &a%s&r/&c%s",
                    munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), uploader, views, likes,
                    dislikes));
        }
        if (stringToParse.matches("^https?://(www\\.)?reddit\\.com/r/.*/comments/.*")) {
            // Reddit comment threads: scrape poster, comment count and votes.
            String poster = doc.select("p.tagline").select("a.author").text().split(" ")[0];
            String comments = doc.select("a.comments").first().text().split(" ")[0];
            String likes = doc.select("span.upvotes").first().text().split(" ")[0];
            String dislikes = doc.select("span.downvotes").first().text().split(" ")[0];
            return colourise(String.format(
                    "(%s's URL) &2Title: &r%s &2Poster: &r%s &2Comments: &r%s &2Rating: &6%s&r/&9%s",
                    munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), poster, comments, likes,
                    dislikes));
        }
        return colourise(String.format("(%s's URL) &2Title: &r%s &2Content Type: &r%s &2Size: &r%s",
                munge(sender.getNick()), StringEscapeUtils.unescapeHtml4(title), contentType, size));
    } catch (IllegalArgumentException ignored) {
        // Not a fetchable URL (malformed input) — deliberately produce no summary.
    } catch (Exception ex) {
        foxbot.getLogger().error("Error occurred while parsing URL", ex);
    }
    return null;
}
From source file:cn.edu.hfut.dmic.webcollector.example.TutorialCrawler2.java
/**
 * Crawler callback invoked for each fetched page: logs the page URL and
 * title, and — when a JDBC template has been wired in — persists the title,
 * URL and raw HTML into the {@code tb_content} table.
 *
 * @param page the fetched page
 * @param nextLinks the URLs to crawl next; with auto-parse enabled the
 *        framework fills this itself, so it is untouched here
 */
@Override
public void visit(Page page, Links nextLinks) {
    final Document doc = page.getDoc();
    final String title = doc.title();
    System.out.println("URL:" + page.getUrl() + " :" + title);
    // Persistence is optional: only store the page when a datasource exists.
    if (jdbcTemplate != null) {
        final int inserted = jdbcTemplate.update("insert into tb_content (title,url,html) value(?,?,?)",
                title, page.getUrl(), page.getHtml());
        if (inserted == 1) {
            System.out.println("mysql??");
        }
    }
}
From source file:org.brunocvcunha.taskerbox.impl.crawler.SlexyAction.java
/**
 * Processes a crawled Slexy listing page: for every paste link under the
 * {@code .main} container, acts once per unseen paste id, persisting the
 * seen-set and throttling between fetches.
 *
 * @param entry the parsed listing document
 */
@Override
public void action(final Document entry) {
    log.debug("Validating " + entry.title());
    for (final Element link : entry.select(".main").select("a")) {
        // The paste id doubles as its title.
        final String id = link.attr("href").replace("/view/", "");
        if (!canAct(id)) {
            continue; // already handled this paste
        }
        addAct(id);
        spreadAction(id, id);
        serializeAlreadyAct();
        sleep(FETCH_INTERVAL);
    }
}