Getting the Links in an HTML Document

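Description

The following code opens a URI, parses the returned HTML with Swing's HTMLEditorKit, and returns the href value of every A (anchor) element in the document.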

Demo Code

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

import javax.swing.text.AttributeSet;
import javax.swing.text.BadLocationException;
import javax.swing.text.EditorKit;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Extracts hyperlink targets from an HTML document using Swing's HTML parser.
 */
public class Main {

  private static final Logger LOG = Logger.getLogger(Main.class.getName());

  /**
   * Opens the given URI, parses the response as HTML and returns the
   * {@code href} value of every {@code A} element, in document order.
   *
   * <p>This method is best-effort: if the URI is {@code null}, syntactically
   * invalid, not absolute, uses an unknown protocol, or the content cannot be
   * read or parsed, the failure is logged at WARNING level and an empty array
   * is returned instead of throwing.
   *
   * @param uriStr the absolute URI of the HTML document, e.g. an
   *     {@code http:} or {@code file:} URI
   * @return the {@code href} values found; never {@code null}, possibly empty
   */
  public static String[] getLinks(String uriStr) {
    List<String> result = new ArrayList<>();
    if (uriStr == null) {
      return new String[0];
    }

    try {
      // URI.toURL() throws IllegalArgumentException for relative URIs, so that
      // case is caught below together with the checked failures.
      URL url = new URI(uriStr).toURL();
      URLConnection conn = url.openConnection();

      // try-with-resources closes the reader (and the underlying stream) even
      // when parsing fails. UTF-8 is used instead of the platform default so
      // the result does not depend on the machine the code runs on.
      try (Reader rd = new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8)) {
        EditorKit kit = new HTMLEditorKit();
        HTMLDocument doc = (HTMLDocument) kit.createDefaultDocument();
        // Without this property a <meta http-equiv="Content-Type" ...charset=...>
        // tag makes the parser abort with ChangedCharSetException (an
        // IOException), which would yield no links for most real pages.
        doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);
        kit.read(rd, doc, 0);

        collectHrefs(doc, result);
      }
    } catch (URISyntaxException | IllegalArgumentException | IOException
        | BadLocationException e) {
      // MalformedURLException is an IOException and is covered here as well.
      LOG.log(Level.WARNING, "Could not extract links from " + uriStr, e);
    }

    return result.toArray(new String[0]);
  }

  /** Appends the {@code href} of every {@code A} element in {@code doc} to {@code sink}. */
  private static void collectHrefs(HTMLDocument doc, List<String> sink) {
    for (HTMLDocument.Iterator it = doc.getIterator(HTML.Tag.A); it.isValid(); it.next()) {
      AttributeSet attrs = it.getAttributes();
      if (attrs == null) {
        continue;
      }
      Object href = attrs.getAttribute(HTML.Attribute.HREF);
      // Anchors without an href (e.g. <a name="...">) are skipped.
      if (href instanceof String) {
        sink.add((String) href);
      }
    }
  }
}
Getting the Links in an HTML Document - Java Swing

Description

Demo Code

Related Tutorials