HTML Parser : HTML Parser « Network « Java Tutorial






/*******************************************************************************
 * Copyright (c) 2004 Actuate Corporation.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *  Actuate Corporation  - initial API and implementation
 *******************************************************************************/


import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.ArrayList;

/**
 * A small, forgiving pull-style tokenizer for HTML files.
 * <p>
 * Typical use: call {@link #open(String)}, then call {@link #getToken()}
 * repeatedly until it returns {@link #EOF}, inspecting the current token
 * through {@link #getTokenText()}, {@link #getElement()},
 * {@link #getElementType()} and {@link #getAttribs()} after each call, and
 * finally call {@link #close()}.
 * <p>
 * The parser keeps its state in plain fields and performs no
 * synchronization.
 */
public class HTMLParser
{
  /** Underlying file reader created by {@link #open(String)}. */
  FileReader reader;

  /** Line-counting wrapper around {@link #reader}; all input is read from it. */
  LineNumberReader in;

  /**
   * Text of the most recent token. For elements this is the tag name with a
   * leading "/" for end tags or a trailing "/" for self-closing tags.
   */
  String token;

  /** Attributes ({@link AttribPair} instances) of the most recent element. */
  ArrayList attribs = new ArrayList( );

  /** One-character push-back slot; -1 means the slot is empty. */
  int pushC = -1;

  /** When true, {@link #getToken()} skips text tokens that are only whitespace. */
  private boolean ignoreWhitespace = true;
  
  // Token types returned by getToken(). DOCTYPE is declared but is not
  // returned by any code in this class.
  public static final int EOF = -1;
  public static final int TEXT = 1;
  public static final int DOCTYPE = 2;
  public static final int ELEMENT = 3;
  public static final int COMMENT = 4;
  public static final int SPECIAL_ELEMENT = 5;
  
  // Element kinds returned by getElementType().
  public static final int START_ELEMENT = 0;
  public static final int END_ELEMENT = 1;
  public static final int SINGLE_ELEMENT = 2;
  
  /**
   * Creates a parser with no input attached; call {@link #open(String)}
   * before requesting tokens.
   */
  public HTMLParser( )
  {
  }
  
  /**
   * Opens the named file for parsing. If a file opened by an earlier call
   * is still attached it is closed first, and any pushed-back character
   * belonging to it is discarded.
   * 
   * @param fileName path of the file to read
   * @throws FileNotFoundException if the file cannot be opened; in that
   *         case the previously attached input (if any) is left untouched
   */
  public void open( String fileName ) throws FileNotFoundException
  {
    // Open the new file first so that a failure leaves the old state intact.
    FileReader newReader = new FileReader( fileName );
    close( );
    reader = newReader;
    in = new LineNumberReader( newReader );
    pushC = -1;
  }
  
  /**
   * Closes the input. Safe to call when nothing has been opened and safe
   * to call more than once. I/O errors raised while closing are ignored
   * because there is nothing useful a caller could do about them.
   */
  public void close( )
  {
    try
    {
      if ( in != null )
        in.close( );
    }
    catch ( IOException ignored )
    {
      // Best-effort close.
    }
    try
    {
      if ( reader != null )
        reader.close( );
    }
    catch ( IOException ignored )
    {
      // Best-effort close.
    }
  }

  /**
   * Returns the raw text of the current token.
   * 
   * @return the text for TEXT, COMMENT and SPECIAL_ELEMENT tokens; for
   *         ELEMENT tokens the tag name including any leading or trailing
   *         "/" marker
   */
  public String getTokenText( )
  {
    return token;
  }
  
  /**
   * Classifies the current element token by its "/" marker.
   * 
   * @return {@link #END_ELEMENT} if the token starts with "/",
   *         {@link #SINGLE_ELEMENT} if it ends with "/", otherwise
   *         {@link #START_ELEMENT}
   */
  public int getElementType( )
  {
    if ( token.startsWith( "/" ) ) //$NON-NLS-1$
      return END_ELEMENT;
    if ( token.endsWith( "/" ) ) //$NON-NLS-1$
      return SINGLE_ELEMENT;
    return START_ELEMENT;
  }
  
  /**
   * Returns the tag name of the current element token with any leading or
   * trailing "/" marker removed.
   * 
   * @return the bare tag name
   */
  public String getElement( )
  {
    if ( token.startsWith( "/" ) ) //$NON-NLS-1$
      return token.substring( 1 );
    if ( token.endsWith( "/" ) ) //$NON-NLS-1$
      return token.substring( 0, token.length( ) - 1 );
    return token;
  }
  
  /**
   * Returns the live list of attributes collected for the most recent
   * start or self-closing element. The list is cleared and refilled each
   * time a new element is parsed.
   * 
   * @return list of {@link AttribPair} objects, possibly empty
   */
  public ArrayList getAttribs( )
  {
    return attribs;
  }
  
  /**
   * Looks up an attribute of the current element by name, ignoring case.
   * 
   * @param name attribute name to look for
   * @return the attribute value; null if the attribute is absent or was
   *         written without a value
   */
  public String getAttrib( String name )
  {
    for ( int i = 0;  i < attribs.size( );  i++ )
    {
      AttribPair a = (AttribPair) attribs.get( i );
      if ( a.attrib.equalsIgnoreCase( name ) )
        return a.value;
    }
    return null;
  }
  
  /**
   * Reads the next character, taking it from the push-back slot when one
   * is waiting there.
   * 
   * @return the character, or {@link #EOF} at end of input or when the
   *         read fails with an I/O error
   */
  private int getC( )
  {
    if ( pushC != -1 )
    {
      int c = pushC;
      pushC = -1;
      return c;
    }
    try
    {
      return in.read( );
    }
    catch ( IOException e )
    {
      // A read failure is deliberately reported as end of input.
      return EOF;
    }
  }
  
  /**
   * Pushes one character back so the next {@link #getC()} returns it.
   * Only a single character can be held at a time.
   * 
   * @param c character to push back
   */
  private void pushC( int c )
  {
    pushC = c;
  }
  
  /**
   * Advances to the next token.
   * 
   * @return one of {@link #EOF}, {@link #TEXT}, {@link #ELEMENT},
   *         {@link #COMMENT} or {@link #SPECIAL_ELEMENT}
   */
  public int getToken( )
  {
    for ( ; ; )
    {
      int c = getC( );
      switch ( c )
      {
        case -1:
          return EOF;
        case '<':
          return getElement( c );
        default:
        {
          parseText( c );
          // Whitespace-only text is skipped (loop again) unless the
          // caller asked to see it.
          if ( ! ignoreWhitespace  ||  token.trim( ).length( ) > 0 )
            return TEXT;
        }
      }
    }
  }

  /**
   * Collects character data up to, but not including, the next '<' or the
   * end of input and stores it in {@link #token}.
   * 
   * @param c first character of the text run
   * @return always {@link #TEXT}
   */
  private int parseText( int c )
  {
    StringBuffer text = new StringBuffer( );
    for ( ; ; )
    {
      if ( c == EOF )
        break;
      if ( c == '<' )
      {
        // Leave the '<' for getToken( ) to see on its next call.
        pushC( c );
        break;
      }
      
      // Convert MS-Word-style quotes (U+201C / U+201D).
      
      if ( c == 8220  ||  c == 8221 )
        text.append( "&quot;" );
      else
        text.append( (char) c );
      c = getC( );
    }

    token = text.toString( );
    return TEXT;
  }

  /**
   * Skips whitespace starting with the given character.
   * 
   * @param c current character
   * @return the first non-whitespace character at or after c, or
   *         {@link #EOF}
   */
  private int skipSpace( int c )
  {
    while ( c != EOF  &&  Character.isWhitespace( (char)c ) )
    {
      c = getC( );
    }
    return c;
  }
  
  /**
   * Parses a tag after its opening '<' has been consumed. On return
   * {@link #token} holds the tag name (with "/" marker) and
   * {@link #attribs} holds the attributes of a start or self-closing tag.
   * 
   * @param c the '<' that introduced the tag (its value is not used)
   * @return {@link #ELEMENT}; {@link #COMMENT} or {@link #SPECIAL_ELEMENT}
   *         for "<!" constructs; {@link #EOF} if the input ends before a
   *         tag name is found
   */
  private int getElement( int c )
  {
    c = getC( );
    
    // Broken element
    
    if ( c == EOF )
      return EOF;
    
    if ( c == '!' )
      return getSpecialElement( );
    
    attribs.clear( );
    c = skipSpace( c );
    if ( c == EOF )
      return EOF;
    
    StringBuffer tag = new StringBuffer( );
    if ( c == '/' )
    {
      // End tag: keep the leading '/', read the name, then discard
      // everything up to the closing '>'.
      tag.append( (char) c );
      c = skipSpace( getC( ) );
      while ( c != EOF  &&  c != '>'  && ! Character.isWhitespace( (char)c ) )
      {
        tag.append( (char) c );
        c = getC( );
      }
      token = tag.toString( );
      for ( ; ; )
      {
        if ( c == '>'  ||  c == -1 )
          break;
        c = getC( );
      }
      return ELEMENT;     
    }
    
    // Start or self-closing tag: read the name.
    while ( c != EOF  &&  c != '>'  &&  c != '/'  && ! Character.isWhitespace( (char)c ) )
    {
      tag.append( (char) c );
      c = getC( );
    }
    if ( c == EOF )
    {
      token = tag.toString( );
      return ELEMENT;
    }
    
    // Read attributes until the tag ends.
    for ( ; ; )
    {
      c = skipSpace( c );
      if ( c == EOF  ||  c == '>' || c == '/' )
        break;
      c = getAttrib( c );
    }
    if ( c == '/' )
    {
      // Self-closing marker: record it on the name and skip to '>'.
      tag.append( (char) c );
      for ( ; ; )
      {
        c = getC( );
        if ( c == -1  ||  c == '>' )
          break;
      }
    }
    token = tag.toString( );
    return ELEMENT;
  }
  
  /**
   * Parses one attribute (name, optionally followed by "=" and a value)
   * and appends it to {@link #attribs}.
   * <p>
   * The character that ends the attribute is returned without being
   * discarded, so the caller always sees the '>' or '/' that ends the tag.
   * Values may be quoted with ' or ", or unquoted. An unquoted value ends
   * at whitespace, at '>', or at a '/' that is immediately followed by
   * '>'; any other '/' is kept as part of the value so that unquoted URLs
   * survive intact.
   * 
   * @param c first character of the attribute name
   * @return the first character following the attribute, or {@link #EOF}
   */
  private int getAttrib( int c )
  {
    AttribPair a = new AttribPair( );
    StringBuffer s = new StringBuffer( );
    // The name also stops at '>' and '/' so that a valueless attribute at
    // the end of a tag does not swallow the tag terminator.
    while ( c != EOF  &&  c != '='  &&  c != '>'  &&  c != '/'
        &&  ! Character.isWhitespace( (char)c ) )
    {
      s.append( (char) c );
      c = getC( );
    }
    a.attrib = s.toString( );
    c = skipSpace( c );
    if ( c != '=' )
    {
      // Attribute without a value; its value stays null.
      attribs.add( a );
      return c;
    }
    s = new StringBuffer( );
    c = skipSpace( getC( ) );
    if ( c == '\'' || c == '"' )
    {
      int quote = c;
      for ( ; ; )
      {
        c = getC( );
        if ( c == -1 )
          break;
        if ( c == quote )
        {
          // Step past the closing quote.
          c = getC( );
          break;
        }
        if ( c == '\\' )
        {
          // A backslash keeps the following character (even a quote)
          // inside the value; both characters are stored.
          c = getC( );
          if ( c == EOF )
            break;
          s.append( '\\' );
          s.append( (char) c );
        }
        else
        {
          s.append( (char) c );
        }
      }
    }
    else
    {
      // Unquoted value: c already holds its first character.
      for ( ; ; )
      {
        if ( c == EOF  ||  c == '>'  ||  Character.isWhitespace( (char)c ) )
          break;
        if ( c == '/' )
        {
          int next = getC( );
          if ( next == '>' )
          {
            // "/>" ends the tag: hand '/' to the caller and leave '>'
            // in the push-back slot.
            pushC( next );
            break;
          }
          s.append( '/' );
          c = next;
          continue;
        }
        s.append( (char) c );
        c = getC( );
      }
    }
    a.value = s.toString( );
    attribs.add( a );
    return c;
  }
  
  /**
   * Name/value pair for one attribute. {@code value} is null when the
   * attribute was written without "=".
   */
  class AttribPair
  {
    String attrib;
    String value;
  }
  
  /**
   * Parses a construct that starts with "<!" (both characters already
   * consumed) and stores its full text, including the delimiters, in
   * {@link #token}.
   * <p>
   * A comment ("<!--") ends at the first '>' that is preceded by "--", so
   * a '>' inside the comment body does not end it. Any other construct
   * ends at the first '>'. If the input ends first, a closing '>' is
   * appended to what was read.
   * 
   * @return {@link #COMMENT} if the text starts with "<!--", otherwise
   *         {@link #SPECIAL_ELEMENT}
   */
  private int getSpecialElement(  )
  {
    StringBuffer text = new StringBuffer( );
    text.append( "<!" ); //$NON-NLS-1$
    for ( ; ; )
    {
      int c = getC( );
      if ( c == EOF )
        break;
      if ( c == '>'  &&  ! isUnterminatedComment( text ) )
        break;
      text.append( (char) c );
    }
    text.append( '>' );
    token = text.toString( );
    if ( token.startsWith( "<!--" ) ) //$NON-NLS-1$
      return COMMENT;
    return SPECIAL_ELEMENT;
  }

  /**
   * Tells whether the text read so far is a comment that a '>' arriving
   * now would not close.
   * 
   * @param text characters read so far, beginning with "<!"
   * @return true if text begins with "<!--" and does not end with "--"
   */
  private static boolean isUnterminatedComment( StringBuffer text )
  {
    int len = text.length( );
    if ( len < 4  ||  text.charAt( 2 ) != '-'  ||  text.charAt( 3 ) != '-' )
      return false;
    return ! ( text.charAt( len - 1 ) == '-'  &&  text.charAt( len - 2 ) == '-' );
  }

  /** Tag names that {@link #isFormatTag(String)} reports as format tags. */
  static String formatTags[ ] =
  {
      "i", "b",  //$NON-NLS-1$//$NON-NLS-2$
      "strong", "em",  //$NON-NLS-1$//$NON-NLS-2$
      "code", "span", //$NON-NLS-1$ //$NON-NLS-2$
      "a" //$NON-NLS-1$
  };
  
  /**
   * Tells whether the current element is one of the format tags.
   * 
   * @return true if the current tag name is listed in {@link #formatTags}
   */
  public boolean isFormatTag( )
  {
    return isFormatTag( getElement( ) );
  }
  
  /**
   * Tells whether the given tag name is one of the format tags, ignoring
   * case.
   * 
   * @param tag tag name without "/" markers
   * @return true if tag is listed in {@link #formatTags}
   */
  public boolean isFormatTag( String tag )
  {
    for ( int i = 0;  i < formatTags.length;  i++ )
    {
      if ( formatTags[ i ].equalsIgnoreCase( tag ) )
        return true;
    }
    return false;
  }

  /**
   * Rebuilds the markup of the current element from the parsed tag name
   * and attributes. Every attribute is written as name="value"; an
   * attribute without a value is written with an empty value.
   * 
   * @return the rebuilt tag as a String (declared as Object)
   */
  public Object getFullElement( )
  {
    StringBuffer text = new StringBuffer( );
    text.append( '<' );
    int elementType = getElementType( );
    if ( elementType == END_ELEMENT )
      text.append( '/' );
    text.append( getElement( ) );
    
    for ( int i = 0;  i < attribs.size( );  i++ )
    {
      text.append( ' ' );
      AttribPair a = (AttribPair) attribs.get( i );
      text.append( a.attrib );
      text.append( "=\"" ); //$NON-NLS-1$
      if ( a.value != null )
        text.append( a.value );
      text.append( "\"" ); //$NON-NLS-1$
    }
    if ( elementType == SINGLE_ELEMENT )
      text.append( '/' );
    text.append( '>' );
    return text.toString( );
  }

  /**
   * Returns the line number reported by the underlying
   * {@link LineNumberReader}.
   * 
   * @return the reader's current line number
   */
  public int getLineNo( )
  {
    return in.getLineNumber( );
  }

  /**
   * Chooses whether {@link #getToken()} skips whitespace-only text.
   * 
   * @param b true to skip whitespace-only text tokens, false to return them
   */
  public void ignoreWhitespace( boolean b )
  {
    ignoreWhitespace = b;
  }

}








19.26.HTML Parser
19.26.1.Getting the Links in an HTML Document
19.26.2.Getting the Text in an HTML Document
19.26.3.Escape HTML special characters from a String
19.26.4.Using javax.swing.text.html.HTMLEditorKit to parse html document
19.26.5.Extract links from an HTML page
19.26.6.extends HTMLEditorKit.ParserCallback
19.26.7.HTML parser based on HTMLEditorKit.ParserCallback
19.26.8.Find and display hyperlinks contained within a web page
19.26.9.Get all hyper links from a web page
19.26.10.HTML Parser