HTML Parser : HTML Output « Servlets « Java






HTML Parser

 
/*******************************************************************************
 * Copyright (c) 2004 Actuate Corporation.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *  Actuate Corporation  - initial API and implementation
 *******************************************************************************/


import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.util.ArrayList;

/**
 * A small, forgiving, pull-style tokenizer for HTML files.
 * <p>
 * Typical use: call {@link #open(String)}, then call {@link #getToken()}
 * repeatedly until it returns {@link #EOF}, inspecting the current token
 * with {@link #getTokenText()}, {@link #getElement()},
 * {@link #getElementType()} and {@link #getAttrib(String)}; finally call
 * {@link #close()}.
 * <p>
 * The parser keeps a single character of push-back and the state of the
 * most recent token only. I/O errors while reading are reported as end of
 * file rather than as exceptions.
 */
public class HTMLParser
{
  /** Underlying file reader created by {@link #open(String)}. */
  FileReader reader;
  /** Line-counting wrapper around {@link #reader}; all reads go through it. */
  LineNumberReader in;
  /**
   * Text of the most recent token. For elements this is the tag name with
   * a leading "/" (end tag) or trailing "/" (self-closing tag); for
   * comments and other "&lt;!" constructs it is the full markup; for text
   * it is the text itself.
   */
  String token;
  /** Attributes (as {@link AttribPair}) of the most recent start/single element. */
  ArrayList attribs = new ArrayList( );
  /** One character of push-back, or -1 when nothing is pushed back. */
  int pushC = -1;
  /** When true, {@link #getToken()} skips text tokens that are only whitespace. */
  private boolean ignoreWhitespace = true;
  
  /** Token type: end of input (also returned when a tag is cut off by end of input). */
  public static final int EOF = -1;
  /** Token type: character data between tags. */
  public static final int TEXT = 1;
  /** Token type constant for a DOCTYPE; declared but not returned by any method of this class. */
  public static final int DOCTYPE = 2;
  /** Token type: a start, end or self-closing element. */
  public static final int ELEMENT = 3;
  /** Token type: a construct starting with "&lt;!--". */
  public static final int COMMENT = 4;
  /** Token type: any other construct starting with "&lt;!". */
  public static final int SPECIAL_ELEMENT = 5;
  
  /** Element type: start tag, e.g. {@code <p>}. */
  public static final int START_ELEMENT = 0;
  /** Element type: end tag, e.g. {@code </p>}. */
  public static final int END_ELEMENT = 1;
  /** Element type: self-closing tag, e.g. {@code <br/>}. */
  public static final int SINGLE_ELEMENT = 2;
  
  /**
   * Creates a parser with no input attached; call {@link #open(String)}
   * before reading tokens.
   */
  public HTMLParser( )
  {
  }
  
  /**
   * Opens the named file for parsing. Any file previously opened by this
   * parser is closed first and pending push-back is discarded. The file is
   * decoded by {@link FileReader}.
   * 
   * @param fileName path of the file to read
   * @throws FileNotFoundException if the file cannot be opened; in that
   *         case the parser's previous input is left untouched
   */
  public void open( String fileName ) throws FileNotFoundException
  {
    // Open the new file first so a failure does not disturb current state.
    FileReader newReader = new FileReader( fileName );
    close( );
    reader = newReader;
    in = new LineNumberReader( reader );
    pushC = -1;
  }
  
  /**
   * Closes the input. Safe to call when nothing was opened and safe to
   * call more than once. Errors raised while closing are ignored.
   */
  public void close( )
  {
    try
    {
      if ( in != null )
        in.close( );
      if ( reader != null )
        reader.close( );
    }
    catch ( IOException ignored )
    {
      // Best-effort close: there is nothing useful the caller could do.
    }
  }

  /**
   * Returns the raw text of the most recent token.
   * 
   * @return the token text, or null if no token has been read yet
   */
  public String getTokenText( )
  {
    return token;
  }
  
  /**
   * Classifies the most recent {@link #ELEMENT} token.
   * 
   * @return {@link #END_ELEMENT} if the token starts with "/",
   *         {@link #SINGLE_ELEMENT} if it ends with "/", otherwise
   *         {@link #START_ELEMENT}
   */
  public int getElementType( )
  {
    if ( token.startsWith( "/" ) ) //$NON-NLS-1$
      return END_ELEMENT;
    if ( token.endsWith( "/" ) ) //$NON-NLS-1$
      return SINGLE_ELEMENT;
    return START_ELEMENT;
  }
  
  /**
   * Returns the tag name of the most recent {@link #ELEMENT} token with
   * the leading or trailing "/" marker removed.
   * 
   * @return the bare tag name, in the case it had in the input
   */
  public String getElement( )
  {
    if ( token.startsWith( "/" ) ) //$NON-NLS-1$
      return token.substring( 1 );
    if ( token.endsWith( "/" ) ) //$NON-NLS-1$
      return token.substring( 0, token.length( ) - 1 );
    return token;
    
  }
  
  /**
   * Returns the live list of attributes collected for the most recent
   * start or self-closing element. The list is cleared and reused when
   * the next element is parsed.
   * 
   * @return list of {@link AttribPair}, in source order
   */
  public ArrayList getAttribs( )
  {
    return attribs;
  }
  
  /**
   * Looks up an attribute of the most recent element by name, ignoring
   * case.
   * 
   * @param name attribute name to find
   * @return the value of the first matching attribute; null if there is
   *         no such attribute or if it was written without a value
   */
  public String getAttrib( String name )
  {
    for ( int i = 0;  i < attribs.size( );  i++ )
    {
      AttribPair a = (AttribPair) attribs.get( i );
      if ( a.attrib.equalsIgnoreCase( name ) )
        return a.value;
    }
    return null;
  }
  
  /**
   * Reads the next character, honouring any pushed-back character first.
   * 
   * @return the character, or {@link #EOF} at end of input or on a read
   *         error
   */
  private int getC( )
  {
    if ( pushC != -1 )
    {
      int c = pushC;
      pushC = -1;
      return c;
    }
    try
    {
      return in.read( );
    }
    catch ( IOException e )
    {
      // Read failures are deliberately reported as end of input.
      return EOF;
    }
  }
  
  /**
   * Pushes one character back so the next {@link #getC()} returns it.
   * Only a single character can be held; pushing {@link #EOF} is a no-op.
   * 
   * @param c the character to push back
   */
  private void pushC( int c )
  {
    pushC = c;
  }
  
  /**
   * Reads the next token from the input.
   * 
   * @return {@link #EOF}, {@link #TEXT}, {@link #ELEMENT},
   *         {@link #COMMENT} or {@link #SPECIAL_ELEMENT}
   */
  public int getToken( )
  {
    for ( ; ; )
    {
      int c = getC( );
      switch ( c )
      {
        case -1:
          return EOF;
        case '<':
          return getElement( c );
        default:
        {
          parseText( c );
          // Whitespace-only text is skipped unless the caller asked for it.
          if ( ! ignoreWhitespace  ||  token.trim( ).length( ) > 0 )
            return TEXT;
        }
      }
    }
  }

  /**
   * Collects character data up to (not including) the next "&lt;" or end
   * of input and stores it in {@link #token}.
   * 
   * @param c the first character of the text
   * @return always {@link #TEXT}
   */
  private int parseText( int c )
  {
    StringBuffer text = new StringBuffer( );
    for ( ; ; )
    {
      if ( c == EOF )
        break;
      if ( c == '<' )
      {
        // Leave the tag opener for the next getToken() call.
        pushC( c );
        break;
      }
      
      // Convert MS-Word-style quotes (U+201C / U+201D) to an entity.
      
      if ( c == 8220  ||  c == 8221 )
        text.append( "&quot;" );
      else
        text.append( (char) c );
      c = getC( );
    }

    token = text.toString( );
    return TEXT;
  }

  /**
   * Skips whitespace starting with the given character.
   * 
   * @param c the current character
   * @return the first non-whitespace character at or after c, or
   *         {@link #EOF}
   */
  private int skipSpace( int c )
  {
    while ( c != EOF  &&  Character.isWhitespace( (char)c ) )
    {
      c = getC( );
    }
    return c;
  }
  
  /**
   * Parses a tag after its opening "&lt;" has been read. Sets
   * {@link #token} to the tag name (with "/" prefix or suffix as
   * described there) and fills {@link #attribs}.
   * 
   * @param c the "&lt;" character (ignored; the next character is read)
   * @return {@link #ELEMENT}, the result of {@link #getSpecialElement()}
   *         for "&lt;!" constructs, or {@link #EOF} if input ends before
   *         a tag name starts
   */
  private int getElement( int c )
  {
    c = getC( );
    
    // Broken element
    
    if ( c == EOF )
      return EOF;
    
    if ( c == '!' )
      return getSpecialElement( );
    
    attribs.clear( );
    c = skipSpace( c );
    if ( c == EOF )
      return EOF;
    
    StringBuffer tag = new StringBuffer( );
    if ( c == '/' )
    {
      // End tag: keep the "/" marker, read the name, discard the rest.
      tag.append( (char) c );
      c = skipSpace( getC( ) );
      while ( c != EOF  &&  c != '>'  && ! Character.isWhitespace( (char)c ) )
      {
        tag.append( (char) c );
        c = getC( );
      }
      token = tag.toString( );
      for ( ; ; )
      {
        if ( c == '>'  ||  c == -1 )
          break;
        c = getC( );
      }
      return ELEMENT;     
    }
    
    while ( c != EOF  &&  c != '>'  &&  c != '/'  && ! Character.isWhitespace( (char)c ) )
    {
      tag.append( (char) c );
      c = getC( );
    }
    if ( c == EOF )
    {
      token = tag.toString( );
      return ELEMENT;
    }
    
    // Attributes: getAttrib returns the first character it did not consume.
    for ( ; ; )
    {
      c = skipSpace( c );
      if ( c == EOF  ||  c == '>' || c == '/' )
        break;
      c = getAttrib( c );
    }
    if ( c == '/' )
    {
      // Self-closing tag: mark it and discard everything up to ">".
      tag.append( (char) c );
      for ( ; ; )
      {
        c = getC( );
        if ( c == -1  ||  c == '>' )
          break;
      }
    }
    token = tag.toString( );
    return ELEMENT;
  }
  
  /**
   * Parses one attribute (name and optional value) and appends it to
   * {@link #attribs}.
   * <p>
   * The name ends at "=", "&gt;", "/", whitespace or end of input. A
   * quoted value runs to the matching quote; a backslash inside it keeps
   * both the backslash and the following character and prevents that
   * character from ending the value. An unquoted value ends at
   * whitespace, "&gt;", end of input, or a "/" that is immediately
   * followed by "&gt;"; any other "/" is part of the value so that
   * unquoted URLs survive.
   * 
   * @param c the first character of the attribute name
   * @return the first character after the attribute that was not
   *         consumed (for example the "&gt;" that ends the tag), or
   *         {@link #EOF}
   */
  private int getAttrib( int c )
  {
    AttribPair a = new AttribPair( );
    StringBuffer s = new StringBuffer( );
    while ( c != EOF  &&  c != '='  &&  c != '>'  &&  c != '/'
        &&  ! Character.isWhitespace( (char)c ) )
    {
      s.append( (char) c );
      c = getC( );
    }
    a.attrib = s.toString( );
    c = skipSpace( c );
    if ( c != '=' )
    {
      // Attribute without a value; its value stays null.
      attribs.add( a );
      return c;
    }
    s = new StringBuffer( );
    c = skipSpace( getC( ) );
    if ( c == '\'' || c == '"' )
    {
      int quote = c;
      for ( ; ; )
      {
        c = getC( );
        if ( c == -1 )
          break;
        if ( c == quote )
        {
          // Step past the closing quote.
          c = getC( );
          break;
        }
        if ( c == '\\' )
        {
          c = getC( );
          if ( c == EOF )
            break;
          s.append( '\\' );
          s.append( (char) c );
        }
        else
        {
          s.append( (char) c );
        }
      }
    }
    else
    {
      // c already holds the first character of the unquoted value.
      while ( c != EOF  &&  c != '>'  &&  ! Character.isWhitespace( (char)c ) )
      {
        if ( c == '/' )
        {
          int next = getC( );
          if ( next == '>' )
          {
            // "/>" ends the tag: hand "/" back to the caller and keep ">".
            pushC( next );
            break;
          }
          s.append( '/' );
          c = next;
          continue;
        }
        s.append( (char) c );
        c = getC( );
      }
    }
    a.value = s.toString( );
    attribs.add( a );
    return c;
  }
  
  /**
   * A single attribute of an element: its name as written and its value
   * (null when the attribute had no "=value" part).
   */
  class AttribPair
  {
    String attrib;
    String value;
  }
  
  /**
   * Tells whether the text collected so far is a comment ("&lt;!--...")
   * that has not yet reached its closing "--".
   * 
   * @param text markup collected so far, always starting with "&lt;!"
   * @return true if a "&gt;" read now would be inside the comment
   */
  private static boolean inOpenComment( StringBuffer text )
  {
    int len = text.length( );
    if ( len < 4  ||  text.charAt( 2 ) != '-'  ||  text.charAt( 3 ) != '-' )
      return false;
    return ! ( text.charAt( len - 1 ) == '-'  &&  text.charAt( len - 2 ) == '-' );
  }
  
  /**
   * Parses a construct whose leading "&lt;!" has already been read and
   * stores its full markup in {@link #token}. Comments are read through
   * to the closing "--&gt;" (so "&gt;" may appear inside them); anything
   * else ends at the first "&gt;". If input ends first, a "&gt;" is
   * appended anyway.
   * 
   * @return {@link #COMMENT} if the markup starts with "&lt;!--",
   *         otherwise {@link #SPECIAL_ELEMENT}
   */
  private int getSpecialElement(  )
  {
    StringBuffer text = new StringBuffer( );
    text.append( "<!" ); //$NON-NLS-1$
    for ( ; ; )
    {
      int c = getC( );
      if ( c == EOF )
        break;
      if ( c == '>'  &&  ! inOpenComment( text ) )
        break;
      text.append( (char) c );
    }
    text.append( '>' );
    token = text.toString( );
    if ( token.startsWith( "<!--" ) ) //$NON-NLS-1$
      return COMMENT;
    return SPECIAL_ELEMENT;
  }

  /** Tag names that {@link #isFormatTag(String)} reports as format tags. */
  static String formatTags[ ] =
  {
      "i", "b",  //$NON-NLS-1$//$NON-NLS-2$
      "strong", "em",  //$NON-NLS-1$//$NON-NLS-2$
      "code", "span", //$NON-NLS-1$ //$NON-NLS-2$
      "a" //$NON-NLS-1$
  };
  
  /**
   * Tells whether the most recent element is one of the format tags.
   * 
   * @return true if {@link #getElement()} is listed in {@link #formatTags}
   */
  public boolean isFormatTag( )
  {
    return isFormatTag( getElement( ) );
  }
  
  /**
   * Tells whether the given tag name is one of the format tags, ignoring
   * case.
   * 
   * @param tag bare tag name
   * @return true if the name is listed in {@link #formatTags}
   */
  public boolean isFormatTag( String tag )
  {
    for ( int i = 0;  i < formatTags.length;  i++ )
    {
      if ( formatTags[ i ].equalsIgnoreCase( tag ) )
        return true;
    }
    return false;
  }

  /**
   * Rebuilds the markup of the most recent element from its parsed parts.
   * Every attribute is written as {@code name="value"}; attributes that
   * had no value are written with an empty value. Values are emitted as
   * stored, without escaping.
   * 
   * @return the reconstructed tag as a String
   */
  public Object getFullElement( )
  {
    StringBuffer text = new StringBuffer( );
    text.append( '<' );
    int elementType = getElementType( );
    if ( elementType == END_ELEMENT )
      text.append( '/' );
    text.append( getElement( ) );
    
    for ( int i = 0;  i < attribs.size( );  i++ )
    {
      text.append( ' ' );
      AttribPair a = (AttribPair) attribs.get( i );
      text.append( a.attrib );
      text.append( "=\"" ); //$NON-NLS-1$
      if ( a.value != null )
        text.append( a.value );
      text.append( "\"" ); //$NON-NLS-1$
    }
    if ( elementType == SINGLE_ELEMENT )
      text.append( '/' );
    text.append( '>' );
    return text.toString( );
  }

  /**
   * Returns the line number reported by the underlying
   * {@link LineNumberReader} for the current read position.
   * 
   * @return the reader's current line number
   */
  public int getLineNo( )
  {
    return in.getLineNumber( );
  }

  /**
   * Controls whether whitespace-only text tokens are skipped by
   * {@link #getToken()}. The default is to skip them.
   * 
   * @param b true to skip whitespace-only text, false to return it
   */
  public void ignoreWhitespace( boolean b )
  {
    ignoreWhitespace = b;
  }

}

   
  








Related examples in the same category

1.Servlet Output HTML Demo
2.Servlet Display Static HTML
3.Prints a conversion table of miles per gallon to kilometers per liter
4.Servlet: Print Table
5.Html utilities
6.Html Parse Servlet
7.Escape and unescape string
8.Escapes newlines, tabs, backslashes, and quotes in the specified string
9.Web Calendar
10.HTML Helper
11.Escape HTML
12.Convert HTML to text
13.Text To HTML
14.Unescape HTML
15.Java object representations of the HTML table structure
16.Entity Decoder
17.Format a color to HTML RGB color format (e.g. #FF0000 for Color.red)
18.Definitions of HTML character entities and conversions between unicode characters and HTML character entities
19.Encode special characters and do formatting for HTML output
20.HTML color names
21.Utility methods for dealing with HTML
22.Filter the specified message string for characters that are sensitive in HTML
23.A collection of all character entities defined in the HTML4 standard.
24.Decode an HTML color string like '#F567BA;' into a Color
25.Normalize Post Data
26.Get HTML Color String from Java Color object
27.HTML Decoder
28.HTML color and Java Color
29.HTML form Utilities
30.Html Dimensions
31.break Lines with HTML
32.insert HTML block dynamically
33.Convert an integer to an HTML RGB value
34.Convert to HTML string