// Getting the Text in an HTML Document : HTML Parser « Network « Java Tutorial

import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;

import javax.swing.text.EditorKit;
import javax.swing.text.html.HTMLDocument;
import javax.swing.text.html.HTMLEditorKit;

/**
 * Fetches an HTML page and prints the text it contains to standard output.
 *
 * <p>The page is parsed with Swing's {@link HTMLEditorKit}. Instead of building a document model,
 * the {@link HTMLDocument} is subclassed so that the parser callback it hands out simply prints
 * every run of text the parser reports; tags and attributes are ignored.
 */
public class Main {
  /**
   * Downloads {@code http://www.google.com} and prints each text run found in it on its own line.
   *
   * @param argv command-line arguments; not used
   * @throws Exception if the URL is malformed, the page cannot be read, or parsing fails
   */
  public static void main(String[] argv) throws Exception {
    HTMLDocument doc = new HTMLDocument() {
      // The editor kit asks the document for its parser callback; returning our own callback
      // means parsed content is printed rather than inserted into the document.
      @Override
      public HTMLEditorKit.ParserCallback getReader(int pos) {
        return new HTMLEditorKit.ParserCallback() {
          @Override
          public void handleText(char[] data, int pos) {
            System.out.println(data);
          }
        };
      }
    };
    // The stream is decoded explicitly below, so tell the parser not to abort with a
    // ChangedCharSetException when the page declares its own charset in a <meta> tag.
    doc.putProperty("IgnoreCharsetDirective", Boolean.TRUE);

    URL url = new URI("http://www.google.com").toURL();
    URLConnection conn = url.openConnection();

    EditorKit kit = new HTMLEditorKit();
    // try-with-resources closes the reader (and the underlying connection stream) even if
    // parsing throws. The charset is fixed to UTF-8 so output does not depend on the platform
    // default; NOTE(review): pages served in another encoding will be decoded as UTF-8 anyway.
    try (Reader rd = new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8)) {
      kit.read(rd, doc, 0);
    }
  }
}

/*
 * 19.26. HTML Parser
 * 	19.26.1.	Getting the Links in an HTML Document
 * 	19.26.2.	Getting the Text in an HTML Document
 * 	19.26.3.	Escape HTML special characters from a String
 * 	19.26.4.	Using javax.swing.text.html.HTMLEditorKit to parse html document
 * 	19.26.5.	Extract links from an HTML page
 * 	19.26.6.	extends HTMLEditorKit.ParserCallback
 * 	19.26.7.	HTML parser based on HTMLEditorKit.ParserCallback
 * 	19.26.8.	Find and display hyperlinks contained within a web page
 * 	19.26.9.	Get all hyper links from a web page
 * 	19.26.10.	HTML Parser
 */