Example usage for org.jsoup.nodes Document getElementsByTag

List of usage examples for org.jsoup.nodes Document getElementsByTag


On this page you can find example usages of org.jsoup.nodes.Document.getElementsByTag.


public Elements getElementsByTag(String tagName) 

Source Link


Finds elements, including and recursively under this element, with the specified tag name.


From source file:gpxparser.GpxParser.java

/**
 * @param args the command line arguments
 */
// Entry point: parses a hard-coded GPX file with jsoup and adds every <trkpt>
// found under <trk>/<trkseg> to a Track as (lat, lon, altitude).
// NOTE(review): this excerpt is truncated - the closing braces of the loop,
// the try/catch and the method itself are not visible.
public static void main(String[] args) {
    // Input path is hard-coded; args is not read in the visible code.
    File input = new File("/home/yonseca/4.gpx");
    Track track = new Track();
    try {
        Document doc = Jsoup.parse(input, "UTF-8");
        Elements trackData = doc.getElementsByTag("trk");
        // NOTE(review): trackName is selected but never used in the visible code.
        Elements trackName = trackData.select("name");
        Elements trkPt = trackData.select("trkseg").select("trkpt");
        for (Iterator<Element> iterator = trkPt.iterator(); iterator.hasNext();) {
            Element dataPoint = iterator.next();
            // lat/lon are attributes of <trkpt>; elevation is the text of its <ele> child.
            // NumberUtils is presumably Apache Commons Lang - confirm how it treats
            // unparsable or empty input.
            Double lat = NumberUtils.toDouble(dataPoint.attr("lat"));
            Double lon = NumberUtils.toDouble(dataPoint.attr("lon"));
            Double altitude = NumberUtils.toDouble(dataPoint.select("ele").text());
            track.addPoint(lat, lon, altitude);

    } catch (IOException ex) {
        // Read/parse failures are logged at SEVERE level; nothing is rethrown here.
        Logger.getLogger(GpxParser.class.getName()).log(Level.SEVERE, null, ex);

From source file:DataCrawler.OpenAIRE.XMLGenerator.java

/**
 * Command-line entry point that fills an XML template from a delimited data file.
 *
 * Expected arguments, per the usage message printed below:
 * template_file csv_file output_dir log_file [start_id].
 * The template (args[0]) is parsed with jsoup's XML parser, the data file (args[1])
 * is read line by line as UTF-8, and for each record a file named after the first
 * field is opened under args[2] while progress is written to the log file (args[3]).
 *
 * NOTE(review): this excerpt is truncated - many closing braces and statements
 * (including whatever consumes the computed "value" strings) are not visible.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
    String text = "";

    try {
        if (args.length < 4) {
            System.out.println("<command> template_file csv_file output_dir log_file [start_id]");

        // InputStream fis = new FileInputStream("E:/Downloads/result-r-00000");
        // NOTE(review): no close() of fis/br/logWriter/writer is visible in this excerpt.
        InputStream fis = new FileInputStream(args[1]);
        BufferedReader br = new BufferedReader(new InputStreamReader(fis, Charset.forName("UTF-8")));

        // String content = new String(Files.readAllBytes(Paths.get("publications_template.xml")));
        // NOTE(review): new String(byte[]) without a charset decodes with the platform default.
        String content = new String(Files.readAllBytes(Paths.get(args[0])));
        // NOTE(review): in jsoup's parse(String, String, Parser) the second argument is the
        // base URI, so "UTF-8" here is presumably not acting as a charset - confirm.
        Document doc = Jsoup.parse(content, "UTF-8", Parser.xmlParser());
        // String outputDirectory = "G:/";
        String outputDirectory = args[2];
        // PrintWriter logWriter = new PrintWriter(new FileOutputStream("publication.log",false));
        PrintWriter logWriter = new PrintWriter(new FileOutputStream(args[3], false));
        // Template elements are looked up lazily (on first use) and then reused for every record.
        Element objectId = null, title = null, publisher = null, dateofacceptance = null, bestlicense = null,
                resulttype = null, originalId = null, originalId2 = null;
        // When a start id is given, "start" stays false until a record with that id is read.
        boolean start = true;
        // String startID = "dedup_wf_001::207a098867b64f3b5af505fa3aeecd24";
        String startID = "";
        if (args.length >= 5) {
            start = false;
            startID = args[4];
        // Holds a short line so it can be prepended to the following line.
        String previousText = "";
        while ((text = br.readLine()) != null) {
            // NOTE(review): the block comment opened on the next line is never closed in this excerpt.
            /*  For publications:
                0. dri:objIdentifier context
               9. title context
               12. publisher context
               18. dateofacceptance
               19. bestlicense @classname
               21. resulttype  @classname
               26. originalId context  
               (Notice that the prefix is null and will use space to separate two different "originalId")

            // A pending partial line is joined with the current one before splitting.
            if (!previousText.isEmpty()) {
                text = previousText + text;
                start = true;
                previousText = "";

            // Fields are separated by "!" and individually stripped of "#" characters.
            String[] items = text.split("!");
            for (int i = 0; i < items.length; ++i) {
                items[i] = StringUtils.strip(items[i], "#");
            if (objectId == null)
                objectId = doc.getElementsByTag("dri:objIdentifier").first();

            if (!start && items[0].equals(startID)) {
                start = true;

            if (title == null)
                title = doc.getElementsByTag("title").first();

            if (publisher == null)
                publisher = doc.getElementsByTag("publisher").first();

            // NOTE(review): only "fewer than 12 fields" is checked here, yet items[19], items[21]
            // and items[26] are read below - confirm what the truncated code does after this check.
            if (items.length < 12) {
                previousText = text;

            if (dateofacceptance == null)
                dateofacceptance = doc.getElementsByTag("dateofacceptance").first();

            if (bestlicense == null)
                bestlicense = doc.getElementsByTag("bestlicense").first();
            bestlicense.attr("classname", items[19]);

            if (resulttype == null)
                resulttype = doc.getElementsByTag("resulttype").first();
            resulttype.attr("classname", items[21]);

            // Field 26 holds up to two space-separated original ids. For each one, the text
            // starting 5 characters after the first "null" is kept (or the whole trimmed
            // string when "null" is absent).
            if (originalId == null || originalId2 == null) {
                Elements elements = doc.getElementsByTag("originalId");
                String[] context = items[26].split(" ");
                if (elements.size() > 0) {
                    if (elements.size() >= 1) {
                        originalId = elements.get(0);
                        if (context.length >= 1) {
                            int indexOfnull = context[0].trim().indexOf("null");
                            String value = "";
                            if (indexOfnull != -1) {
                                if (context[0].trim().length() >= (indexOfnull + 5))
                                    value = context[0].trim().substring(indexOfnull + 5);

                            } else {
                                value = context[0].trim();
                    if (elements.size() >= 2) {
                        originalId2 = elements.get(1);
                        if (context.length >= 2) {
                            int indexOfnull = context[1].trim().indexOf("null");
                            String value = "";
                            if (indexOfnull != -1) {
                                if (context[1].trim().length() >= (indexOfnull + 5))
                                    value = context[1].trim().substring(indexOfnull + 5);

                            } else {
                                value = context[1].trim();
            } else {
                // Same extraction as above, for the case where both elements are already cached.
                String[] context = items[26].split(" ");
                if (context.length >= 1) {
                    int indexOfnull = context[0].trim().indexOf("null");
                    String value = "";
                    if (indexOfnull != -1) {
                        if (context[0].trim().length() >= (indexOfnull + 5))
                            value = context[0].trim().substring(indexOfnull + 5);

                    } else {
                        value = context[0].trim();
                if (context.length >= 2) {
                    int indexOfnull = context[1].trim().indexOf("null");
                    String value = "";
                    if (indexOfnull != -1) {
                        if (context[1].trim().length() >= (indexOfnull + 5))
                            value = context[1].trim().substring(indexOfnull + 5);

                    } else {
                        value = context[1].trim();
            if (start) {
                // Output file name: the record id with ":" replaced by "#", plus ".xml".
                // NOTE(review): outputDirectory is concatenated without a separator, so it
                // presumably has to end with one - confirm.
                String filePath = outputDirectory + items[0].replace(":", "#") + ".xml";
                PrintWriter writer = new PrintWriter(new FileOutputStream(filePath, false));
                logWriter.write(filePath + " > Start" + System.lineSeparator());
                writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + System.lineSeparator());
                logWriter.write(filePath + " > OK" + System.lineSeparator());

    } catch (Exception e) {

From source file:com.geecko.QuickLyric.lyrics.MetalArchives.java

/**
 * Looks up a song on metal-archives.com by artist and title and builds a Lyrics result.
 *
 * The advanced-search endpoint is queried, the first row of its "aaData" array is turned
 * into an HTML fragment, and the lyrics id found in that fragment is used to request the
 * lyrics page.
 *
 * NOTE(review): this excerpt is truncated - the loop body that fills the builder, the end
 * of the lyrics-fetching statement, and the code that populates the returned Lyrics are
 * not visible.
 *
 * @param artist band name to search for
 * @param title  song title to search for
 * @return a Lyrics built with ERROR on IOException, NO_RESULT on JSONException, and
 *         POSITIVE_RESULT otherwise
 */
public static Lyrics fromMetaData(String artist, String title) {
    String baseURL = "http://www.metal-archives.com/search/ajax-advanced/searching/songs/?bandName=%s&songTitle=%s&releaseType[]=1&exactSongMatch=1&exactBandMatch=1";
    // NOTE(review): only whitespace is replaced by "+"; other characters are not
    // URL-encoded here (e.g. "&" or "%" in a name) - confirm whether Net handles that.
    String urlArtist = artist.replaceAll("\\s", "+");
    String urlTitle = title.replaceAll("\\s", "+");
    String url;
    String text;
    try {
        String response = Net.getUrlAsString(String.format(baseURL, urlArtist, urlTitle));
        JSONObject jsonResponse = new JSONObject(response);
        // Only the first search hit is used.
        JSONArray track = jsonResponse.getJSONArray("aaData").getJSONArray(0);
        StringBuilder builder = new StringBuilder();
        for (int i = 0; i < track.length(); i++)
        Document trackDocument = Jsoup.parse(builder.toString());
        // NOTE(review): get(1) / get(0) assume the fragment contains at least two links and
        // one "viewLyrics" element; a differently shaped row would raise an unchecked
        // exception that neither catch below handles.
        url = trackDocument.getElementsByTag("a").get(1).attr("href");
        // substring(11) drops the first 11 characters of the element id - presumably a
        // fixed prefix in front of the numeric lyrics id; confirm against the site markup.
        String id = trackDocument.getElementsByClass("viewLyrics").get(0).id().substring(11);
        text = Jsoup.connect("http://www.metal-archives.com/release/ajax-view-lyrics/id/" + id).get().body()
    } catch (IOException e) {
        return new Lyrics(ERROR);
    } catch (JSONException e) {
        return new Lyrics(NO_RESULT);
    Lyrics lyrics = new Lyrics(POSITIVE_RESULT);

    return lyrics;

From source file:com.bibisco.export.HTMLParser.java

/**
 * Parses an HTML string with jsoup and hands its body element to parseNode, starting
 * from a fresh TextFormatting and the given exporter.
 *
 * NOTE(review): the method's closing brace is not visible in this excerpt.
 *
 * @param pStrHTML  HTML source to parse
 * @param pExporter exporter passed through to parseNode
 */
public static void parse(String pStrHTML, IExporter pExporter) {

    mLog.debug("Start parse(String, IExporter)");

    // parse html
    Document lDocument = Jsoup.parse(pStrHTML);

    // get body element (first <body> in the parsed document)
    Element lElementBody = lDocument.getElementsByTag("body").get(0);

    // parse body
    parseNode(lElementBody, new TextFormatting(), pExporter);

    mLog.debug("End parse(String, IExporter)");

From source file:damo.three.ie.util.HtmlUtilities.java

/**
 * Parses the My3 account usage page to nicer JSON format.
 * @param pageContent Page content as HTML.
 * @return Usage information stripped out and formatted as JSON.
 * @throws JSONException if building the JSON result fails
 */
// Walks every <table> in the page and, for each body row with exactly three cells,
// builds a JSON object with the keys "item", "value1", "value2" (plus "value3" for
// items whose name starts with "Internet").
// NOTE(review): this excerpt is truncated - several closing braces, the statement that
// skips "Total" rows, and the code that adds currentItem to jsonArray are not visible.
public static JSONArray parseUsageAsJSONArray(String pageContent) throws JSONException {
    // The HTML on the prepay page is messy, so jsoup is used to
    // clean and parse it.
    Document doc = Jsoup.parse(pageContent);

    Elements elements = doc.getElementsByTag("table");

    JSONArray jsonArray = new JSONArray();

    // The provider has no sub label for the 3-to-3 calls, which is not consistent with other items.
    // This flag marks that the current table is in that special case.
    boolean three2threeCallsBug = false;

    for (Element element : elements) {

        for (Element subelement : element.select("tbody > tr")) {

            // A row mentioning both phrases marks the 3-to-3 calls special case.
            if ((subelement.text().contains("3 to 3 Calls")) && (subelement.text().contains("Valid until"))) {
                three2threeCallsBug = true;

            Elements subsubelements = subelement.select("td");

            if (subsubelements.size() == 3) {

                // skip the "total" entries
                if (subsubelements.select("td").get(0).text().contains("Total")) {

                JSONObject currentItem = new JSONObject();

                if (three2threeCallsBug) {
                    currentItem.put("item", "3 to 3 Calls");
                } else {
                    // Get rid of that "non-breaking space" character if it exists
                    String titleToClean = subsubelements.select("td").get(0).text().replace("\u00a0", "")
                    currentItem.put("item", titleToClean);

                 * Check if date contains "Today", if so, change it to a date.
                 * Otherwise we will never know when usage ends, unless user refreshes, As 'today'
                 * is 'today', tomorrow.. see!
                String value1 = subsubelements.select("td").get(1).text();
                if (value1.equals("Today")) {
                    // Replace the relative label with today's date formatted as dd/MM/yy.
                    DateTimeFormatter formatter = DateTimeFormat.forPattern("dd/MM/yy").withLocale(Locale.UK);
                    DateTime dt = new DateTime(); // current datetime
                    value1 = "Expires " + formatter.print(dt);
                currentItem.put("value1", value1);
                currentItem.put("value2", subsubelements.select("td").get(2).text());

                // Out of Bundle charges have an extra property
                if (currentItem.getString("item").startsWith("Internet")) {

                    // NOTE(review): the pattern is recompiled for every matching row and
                    // matches() must match the whole page content, not just a part of it -
                    // confirm OUT_OF_BUNDLE_REGEX is written with that in mind.
                    Pattern p1 = Pattern.compile(Constants.OUT_OF_BUNDLE_REGEX, Pattern.DOTALL);
                    Matcher m1 = p1.matcher(pageContent);

                    StringBuilder cleanedDate = new StringBuilder();
                    if (m1.matches()) {
                        cleanedDate.append(' ');
                        cleanedDate.append(' ');
                        currentItem.put("value3", cleanedDate.toString());



        // reset the 3-to-3 call bug flag for next Element
        if (three2threeCallsBug) {
            three2threeCallsBug = false;

    return jsonArray;

From source file:org.keycloak.testsuite.util.saml.LoginBuilder.java

/**
 * Builds the HTTP request that submits the login form found in the given page.
 *
 * Every input of the form is collected as a name/value pair; the inputs with id
 * "username" and "password" get the user's credentials, all others keep their current
 * value. A POST form yields an HttpPost to the form action, any other method yields an
 * HttpGet with the pairs as query parameters.
 *
 * NOTE(review): this excerpt is truncated - several closing braces are missing and no
 * statement attaching formEntity to the HttpPost is visible.
 *
 * @param user      user whose username and password are filled in
 * @param loginPage HTML of the login page
 * @return the request submitting the form
 * @throws IllegalArgumentException if the page contains no form
 */
public static HttpUriRequest handleLoginPage(UserRepresentation user, String loginPage) {
    String username = user.getUsername();
    String password = getPasswordOf(user);
    org.jsoup.nodes.Document theLoginPage = Jsoup.parse(loginPage);

    List<NameValuePair> parameters = new LinkedList<>();
    for (Element form : theLoginPage.getElementsByTag("form")) {
        String method = form.attr("method");
        String action = form.attr("action");
        // equalsIgnoreCase on a literal already returns false for null, so the explicit
        // null check is redundant but harmless.
        boolean isPost = method != null && "post".equalsIgnoreCase(method);

        for (Element input : form.getElementsByTag("input")) {
            if (Objects.equals(input.id(), "username")) {
                parameters.add(new BasicNameValuePair(input.attr("name"), username));
            } else if (Objects.equals(input.id(), "password")) {
                parameters.add(new BasicNameValuePair(input.attr("name"), password));
            } else {
                // Any other input (e.g. hidden fields) is passed through unchanged.
                parameters.add(new BasicNameValuePair(input.attr("name"), input.val()));
            }

        if (isPost) {
            HttpPost res = new HttpPost(action);

            UrlEncodedFormEntity formEntity;
            try {
                formEntity = new UrlEncodedFormEntity(parameters, "UTF-8");
            } catch (UnsupportedEncodingException e) {
                throw new RuntimeException(e);

            return res;
        } else {
            UriBuilder b = UriBuilder.fromPath(action);
            for (NameValuePair parameter : parameters) {
                b.queryParam(parameter.getName(), parameter.getValue());
            return new HttpGet(b.build());

    // Reached only when the page has no <form> element.
    throw new IllegalArgumentException("Invalid login form: " + loginPage);

From source file:com.nuance.expertassistant.ContentExtractor.java

/**
 * Walks every element of the document and prints nested section markup: one opening
 * section tag for the document title, then one per h1-h4 heading, titled with the
 * heading text after stripNonValidXMLCharacters.
 *
 * NOTE(review): this excerpt is truncated - the bodies of most branches (including
 * whatever closes sections and maintains openHeaderList) and many closing braces are
 * not visible.
 *
 * @param doc parsed document to extract from
 */
public static void extract(Document doc) {

    // NOTE(review): links, ps and title are not used in the visible code.
    final Elements links = doc.getElementsByTag("a");
    final Elements ps = doc.select("p");

    final String title = doc.title();

    print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(doc.title()) + "\">");

    final Elements elements = doc.select("*");

    // Heading levels whose section is currently open.
    final ArrayList<String> openHeaderList = new ArrayList<String>();

    for (final Element element : elements) {
        // NOTE(review): trim() == "" compares references, not contents, so it is
        // unlikely to be true for whitespace-only text; isEmpty() on the trimmed string
        // looks like the intent - confirm.
        if (element.ownText() == null || element.ownText().isEmpty() || element.ownText().trim() == "") {

        // NOTE(review): contains("a") matches every tag whose name contains the letter
        // "a" (e.g. "table", "span"), not only anchors - confirm this is intended.
        } else if (element.tagName().toString().contains("a")) {

        } else if (element.tagName().contains("h1") && element.text() != null && !element.text().isEmpty()) {

            // A new h1 is checked against open sections of level h1 and deeper.
            if (openHeaderList.contains("h1")) {
            }
            if (openHeaderList.contains("h2")) {
            if (openHeaderList.contains("h3")) {
            if (openHeaderList.contains("h4")) {

            print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">");

        } else if (element.tagName().contains("h2") && element.text() != null && !element.text().isEmpty()) {

            if (openHeaderList.contains("h2")) {
            if (openHeaderList.contains("h3")) {
            if (openHeaderList.contains("h4")) {

            print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">");

        } else if (element.tagName().contains("h3") && element.text() != null && !element.text().isEmpty()) {

            if (openHeaderList.contains("h3")) {
            if (openHeaderList.contains("h4")) {

            print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">");

        } else if (element.tagName().contains("h4") && element.text() != null && !element.text().isEmpty()) {

            if (openHeaderList.contains("h4")) {

            print("<section id =\"{}\" title =\"" + stripNonValidXMLCharacters(element.text()) + "\">");


        else {

         * if (element.tagName().contains("img")) { print("<img src=\"" +
         * element.attr("src") + "\"></img>"); }

    // After the walk, every heading level is checked once more.
    if (openHeaderList.contains("h1")) {
    if (openHeaderList.contains("h2")) {
    if (openHeaderList.contains("h3")) {
    if (openHeaderList.contains("h4")) {


From source file:net.sf.texprinter.utils.StringUtils.java

/**
 * Escapes HTML entities and tags to a TeX format. This method tries to
 * replace HTML code by the TeX equivalent macros.
 * @param text The input text.
 * @return A new text formatted from HTML to TeX.
 */
// Converts an HTML fragment to TeX in four steps: (1) regex replacement of simple tags
// with TeX macros, (2) rewriting of <a> elements to \href, (3) replacement of <img> tags
// with figure environments while downloading each image, (4) unescaping of HTML entities.
// NOTE(review): this excerpt is truncated - many closing braces and several statements
// (adding to the image list, ticking the loop saver, the logging calls, writing the
// replacement image) are not visible.
public static String escapeHTMLtoTeX(String text) {

    // replace bold tags
    String newText = text.replaceAll("<b>", "\\\\textbf{");
    newText = newText.replaceAll("</b>", "}");

    // replace strong tags (also rendered as bold)
    newText = newText.replaceAll("<strong>", "\\\\textbf{");
    newText = newText.replaceAll("</strong>", "}");

    // replace italic tags
    newText = newText.replaceAll("<i>", "\\\\textit{");
    newText = newText.replaceAll("</i>", "}");

    // replace emphasized tags
    newText = newText.replaceAll("<em>", "\\\\emph{");
    newText = newText.replaceAll("</em>", "}");

    // replace paragraphs tags
    newText = newText.replaceAll("<p>", "");
    newText = newText.replaceAll("</p>", "\n\n");

    // replace ordered lists tags
    newText = newText.replaceAll("<ol>", "\\\\begin{enumerate}\n");
    newText = newText.replaceAll("</ol>", "\\\\end{enumerate}\n");

    // replace unordered lists tags
    newText = newText.replaceAll("<ul>", "\\\\begin{itemize}\n");
    newText = newText.replaceAll("</ul>", "\\\\end{itemize}\n");

    // replace item tags
    newText = newText.replaceAll("<li>", "\\\\item ");
    newText = newText.replaceAll("</li>", "\n");

    // replace blockquote tags
    newText = newText.replaceAll("<blockquote>", "\\\\begin{quotation}\n");
    newText = newText.replaceAll("</blockquote>", "\\\\end{quotation}\n");

    // replace code tags
    // NOTE(review): ".*" in the second pattern is greedy, so with two such <pre> openers
    // on one line everything between the first and the last would be swallowed - confirm.
    newText = newText.replaceAll("<pre><code>", "\\\\begin{TeXPrinterListing}\n");
    newText = newText.replaceAll("<pre class=.*\"><code>", "\\\\begin{TeXPrinterListing}\n");
    newText = newText.replaceAll("</code></pre>", "\\\\end{TeXPrinterListing}\n\n");

    // replace inline code tags
    newText = newText.replaceAll("<code>", "\\\\lstinline|");
    newText = newText.replaceAll("</code>", "|");

    // strip alt attributes (same greedy ".*" caveat as above)
    newText = newText.replaceAll("alt=\".*\" ", "");

    // parse the text
    Document docLinks = Jsoup.parse(newText);

    // get all the links
    Elements links = docLinks.getElementsByTag("a");

    // if there are links
    if (links.size() > 0) {

        // for every link
        for (Element link : links) {

            // get the outer HTML
            String temp = link.outerHtml();

            // replace it; the outer HTML is quoted so it is matched literally.
            // NOTE(review): this only matches if jsoup's re-serialised outer HTML is
            // character-for-character what newText contains - confirm.
            newText = newText.replaceFirst(Pattern.quote(temp),
                    "\\\\href{" + link.attr("href") + "}{" + link.text() + "}");


    // create a list of images
    ArrayList<ImageGroup> images = new ArrayList<ImageGroup>();

    // parse the current text (the original input, not the partially converted newText)
    Document doc = Jsoup.parse(text);

    // fetch all the media found
    Elements media = doc.select("[src]");

    // for all media found
    for (Element m : media) {

        // if it's an image tag
        if (m.tagName().equals("img")) {

            // create a new image group with the image link
            ImageGroup image = new ImageGroup(m.attr("abs:src"));

            // add to the list of images

            // set the current image to null
            image = null;

    // create a new loop saver
    LoopSaver lps = null;

    // for every image in the list of images
    for (ImageGroup img : images) {

        // create a new object
        lps = new LoopSaver();

        // while there are references for that image in the text
        while (newText.indexOf(img.getURL()) != -1) {

            // tick loop

            // replace the occurrence of that image
            // NOTE(review): the URL is used as a regex here without Pattern.quote, and only
            // the exact form <img src="URL" /> is matched; if nothing is replaced the loop
            // condition never changes - presumably what the loop saver guards against.
            newText = newText.replaceFirst("<img src=\"" + img.getURL() + "\" />",
                    "\\\\begin{figure}[h!]\n\\\\centering\n\\\\includegraphics[scale=0.5]{" + img.getName()
                            + "}\n\\\\end{figure}");

        // lets try
        try {

            // finally, download the image to the current directory
            Downloader.download(img.getURL(), img.getName());

        } catch (Exception exception) {

            // log message
                    "An error occurred while getting the current image. Trying to set the replacement image instead. MESSAGE: {0}",

            // image could not be downloaded for any reason
            try {

                // open a file stream
                FileOutputStream f = new FileOutputStream(img.getName());

                // write a replacement image

                // close the file

            } catch (IOException ioexception) {

                // log message
                        "An IO exception occured while trying to create the image replacement. MESSAGE: {0}",

            } catch (Exception except) {

                // log message
                        "An error occured while trying to create the image replacement. MESSAGE: {0}",




    // unescape all HTML entities
    newText = StringEscapeUtils.unescapeHtml(newText);

    // return new text
    return newText;

From source file:FILER.java

/**
 * Parses an HTML file and collects its "important" strings.
 *
 * The returned array has four slots: [0] the document title, [1] the concatenated text
 * of all heading elements, [2] the concatenated non-empty img alt texts, [3] the URL.
 * As a side effect, the Text variable declared outside this method is reset and then
 * filled with the document text followed by the alt texts.
 *
 * NOTE(review): this excerpt is truncated - braces are missing and slot [3] is never
 * assigned in the visible code.
 *
 * @param f HTML file to read, decoded as UTF-8
 * @return the four-element array described above
 * @throws IOException if the file cannot be read
 */
public static String[] Dealing_Files(File f) throws IOException //return array of important strings in the file
    Text = "";
    String[] Importants = { "", "", "", "" }; //first element is the title,second is all headers,third is img alt,4th is the url
    org.jsoup.nodes.Document doc = Jsoup.parse(f, "UTF-8");
    Importants[0] = doc.title(); //get the title of the file
    //Text=Text+" "+doc.title(); 
    String tag = "h";
    String All_Headers = "";
    Elements Header;
    // NOTE(review): the loop probes h1..h19 although HTML only defines h1..h6, and each
    // tag is queried twice (select, then getElementsByTag) - confirm whether intended.
    for (int i = 1; i < 20; i++) //loop to get text with headers tag of the file
        tag = "h" + String.valueOf(i);
        Header = doc.select(tag);
        if (Header.size() > 0) {
            Header = doc.getElementsByTag(tag);
            String pConcatenated = "";
            // Join the text of every heading of this level, separated by spaces.
            for (Element x : Header) {
                pConcatenated += x.text() + " ";
            All_Headers = All_Headers + pConcatenated;
        } else

    Importants[1] = All_Headers;
    Text = Text + " " + doc.text(); //get the text of the document
    Elements img = doc.getElementsByTag("img"); //get the text with img tag 
    for (Element element : img) {
        // Only images with a non-empty alt attribute contribute.
        if (element.attr("alt") != null && !(element.attr("alt").equals(""))) {
            Text = Text + " " + element.attr("alt");
            Importants[2] = Importants[2] + " " + element.attr("alt");
    return Importants;

From source file:index.IndexManager.java

/**
 * Builds a Solr input document from a parsed page and returns it together with the
 * page's outgoing links and media URLs.
 *
 * Fields set: "id" (document location), "time" (current epoch millis as a string),
 * "title", one "link"/"media" value per collected URL, one value per h1, h2, h3,
 * strong, em, b, u and i element (via formatText), and the page text split by
 * chunkToLength into fields named "1_text", "2_text", ...
 *
 * NOTE(review): this excerpt is truncated - the terminal operations of the two stream
 * pipelines that build links and media, and the method's closing brace, are not visible.
 *
 * @param document parsed page to index
 * @return a triple of (Solr document, link URLs, media URLs)
 */
public static Triple<SolrInputDocument, Collection<String>, Collection<String>> index(Document document) {
    final SolrInputDocument index = new SolrInputDocument();
    index.setField("id", document.location());
    index.setField("time", String.valueOf(System.currentTimeMillis()));
    index.setField("title", document.title());

    // "abs:" asks jsoup for the attribute resolved against the document's base URI.
    final Set<String> links = document.select("a[href]").stream().map(e -> e.attr("abs:href"))
    final Set<String> media = document.select("[src]").stream().map(e -> e.attr("abs:src"))

    links.forEach(link -> index.addField("link", link));
    media.forEach(link -> index.addField("media", link));

    // One multi-valued field per emphasised/heading tag, named after the tag.
    formatText(document.getElementsByTag("h1").stream()).forEach(e -> index.addField("h1", e));

    formatText(document.getElementsByTag("h2").stream()).forEach(e -> index.addField("h2", e));

    formatText(document.getElementsByTag("h3").stream()).forEach(e -> index.addField("h3", e));

    formatText(document.getElementsByTag("strong").stream()).forEach(e -> index.addField("strong", e));

    formatText(document.getElementsByTag("em").stream()).forEach(e -> index.addField("em", e));

    formatText(document.getElementsByTag("b").stream()).forEach(e -> index.addField("b", e));

    formatText(document.getElementsByTag("u").stream()).forEach(e -> index.addField("u", e));

    formatText(document.getElementsByTag("i").stream()).forEach(e -> index.addField("i", e));

    // Chunk fields are numbered from 1 because i is pre-incremented.
    int i = 0;
    Collection<String> text = chunkToLength(document.text());
    for (String chunk : text)
        index.addField(++i + "_text", chunk);

    return Triple.of(index, links, media);