LexicalProcessor.java :  » Parser » chaperon-3.0 » net » sourceforge » chaperon » process » Java Open Source

Java Open Source » Parser » chaperon 3.0 
chaperon 3.0 » net » sourceforge » chaperon » process » LexicalProcessor.java
/*
 *  Copyright (C) Chaperon. All rights reserved.
 *  -------------------------------------------------------------------------
 *  This software is published under the terms of the Apache Software License
 *  version 1.1, a copy of which has been included  with this distribution in
 *  the LICENSE file.
 */

package net.sourceforge.chaperon.process;

import net.sourceforge.chaperon.common.Decoder;

import org.apache.commons.logging.Log;

import org.xml.sax.*;
import org.xml.sax.ext.LexicalHandler;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.LocatorImpl;

/**
 * The processor converts a text stream into lexical tokens, like a tokenizer.
 *
 * <p>It acts as a SAX filter: events outside of a <code>text</code> element of the
 * namespace {@link #NS} are forwarded to the content handler, while the character data
 * inside such an element is buffered and, when the element ends, split into lexemes by
 * means of the configured lexical automaton. The result is emitted as an
 * <code>output</code> element of the namespace {@link #NS_OUTPUT}.</p>
 *
 * @author <a href="mailto:stephan@apache.org">Stephan Michels </a>
 * @version CVS $Id: LexicalProcessor.java,v 1.22 2004/01/04 16:54:34 benedikta Exp $
 */
public class LexicalProcessor implements ContentHandler, LexicalHandler
{
  /** Namespace of the consumed text element. */
  public static final String NS = "http://chaperon.sourceforge.net/schema/text/1.0";

  /** Local name of the consumed text element. */
  public static final String TEXT = "text";

  /** Namespace of the produced elements. */
  public static final String NS_OUTPUT = "http://chaperon.sourceforge.net/schema/lexer/2.0";

  /** Local name of the produced root element. */
  public static final String OUTPUT = "output";

  /** Local name of the element produced for a recognized lexeme. */
  public static final String LEXEME = "lexeme";

  /** Local name of the element produced for a group of a lexeme. */
  public static final String GROUP = "group";

  /** Local name of the element produced for text which was not recognized. */
  public static final String ERROR = "error";
  private ContentHandler contentHandler = null;
  private LexicalHandler lexicalHandler = null;
  private static final int STATE_OUTSIDE = 0;
  private static final int STATE_TEXT = 1;
  private int state = STATE_OUTSIDE;

  // Locator supplied by the event source, and the copy handed to the content handler
  private Locator locator = null;
  private LocatorImpl locatorImpl = null;
  private LexicalAutomaton automaton = null;
  private Log log = null;
  private boolean grouping = false;
  private boolean localizable = false;

  // Origin and current position of the buffered text
  private String source;
  private int lineNumber;
  private int columnNumber;
  private StringBuffer buffer = null;
  private char[] text = null;

  /**
   * Create a new lexical processor.
   */
  public LexicalProcessor() {}

  /**
   * Create a new lexical processor.
   *
   * @param automaton Lexical automaton, which should be used.
   */
  public LexicalProcessor(LexicalAutomaton automaton)
  {
    this.automaton = automaton;
  }

  /**
   * Set the lexical automaton, which the processor should use.
   *
   * @param automaton Lexical automaton, which should be used.
   */
  public void setLexicalAutomaton(LexicalAutomaton automaton)
  {
    this.automaton = automaton;
  }

  /**
   * Set the <code>ContentHandler</code> that will receive XML data.
   */
  public void setContentHandler(ContentHandler handler)
  {
    this.contentHandler = handler;
  }

  /**
   * Set the <code>LexicalHandler</code> that will receive XML data.
   */
  public void setLexicalHandler(LexicalHandler handler)
  {
    this.lexicalHandler = handler;
  }

  /**
   * Set the log, which should be used.
   *
   * @param log Log.
   */
  public void setLog(Log log)
  {
    this.log = log;
  }

  /**
   * Set whether the groups of a recognized lexeme should be emitted as
   * <code>group</code> elements.
   *
   * @param grouping True, if the groups should be emitted.
   */
  public void setGrouping(boolean grouping)
  {
    this.grouping = grouping;
  }

  /**
   * Set whether the produced elements should carry <code>source</code>,
   * <code>line</code> and <code>column</code> attributes.
   *
   * @param localizable True, if the location attributes should be emitted.
   */
  public void setLocalizable(boolean localizable)
  {
    this.localizable = localizable;
  }

  /**
   * Receive an object for locating the origin of SAX document events.
   */
  public void setDocumentLocator(Locator locator)
  {
    this.locator = locator;
    this.locatorImpl = null;
    if (locator!=null)
    {
      // The content handler gets a copy, whose position this processor can modify
      this.locatorImpl = new LocatorImpl(locator);
      contentHandler.setDocumentLocator(locatorImpl);
    }
  }

  /**
   * Receive notification of the beginning of a document.
   */
  public void startDocument() throws SAXException
  {
    syncLocatorWithSource();

    contentHandler.startDocument();
    state = STATE_OUTSIDE;

    buffer = new StringBuffer();
  }

  /**
   * Receive notification of the beginning of an element.
   *
   * @throws SAXException If an element starts within a text element.
   */
  public void startElement(String namespaceURI, String localName, String qName, Attributes atts)
    throws SAXException
  {
    if (state==STATE_OUTSIDE)
    {
      if ((namespaceURI!=null) && (namespaceURI.equals(NS)) && (localName.equals(TEXT)))
      {
        state = STATE_TEXT;
        buffer = new StringBuffer();

        // Explicit attributes win over the locator, which wins over the defaults
        String sourceValue = atts.getValue("source");
        if (sourceValue!=null)
          source = sourceValue;
        else if (locator!=null)
          source = locator.getSystemId();
        else
          source = "unknown";

        String columnValue = atts.getValue("column");
        if (columnValue!=null)
          columnNumber = Integer.parseInt(columnValue);
        else if (locator!=null)
          columnNumber = locator.getColumnNumber();
        else
          columnNumber = 1;

        String lineValue = atts.getValue("line");
        if (lineValue!=null)
          lineNumber = Integer.parseInt(lineValue);
        else if (locator!=null)
          lineNumber = locator.getLineNumber();
        else
          lineNumber = 1;
      }
      else
        contentHandler.startElement(namespaceURI, localName, qName, atts);
    }
    else if (state==STATE_TEXT)
      throw new SAXException("Unexpected start element '"+qName+"'.");
  }

  /**
   * Receive notification of character data.
   */
  public void characters(char[] ch, int start, int length)
    throws SAXException
  {
    if (state==STATE_OUTSIDE)
      contentHandler.characters(ch, start, length);
    else if (state==STATE_TEXT)
      buffer.append(ch, start, length);
  }

  /**
   * Receive notification of ignorable whitespace in element content.
   */
  public void ignorableWhitespace(char[] ch, int start, int length)
    throws SAXException
  {
    // NOTE(review): outside of a text element the whitespace is forwarded as ordinary
    // character data, not as ignorable whitespace - confirm that this is intended
    if (state==STATE_OUTSIDE)
      contentHandler.characters(ch, start, length);
    else if (state==STATE_TEXT)
      buffer.append(ch, start, length);
  }

  /**
   * Receive notification of the end of an element.
   *
   * @throws SAXException If another element than the text element ends within a text element.
   */
  public void endElement(String namespaceURI, String localName, String qName)
    throws SAXException
  {
    if (state==STATE_OUTSIDE)
      contentHandler.endElement(namespaceURI, localName, qName);
    else if (state==STATE_TEXT)
    {
      if ((namespaceURI!=null) && (namespaceURI.equals(NS)) && (localName.equals(TEXT)))
      {
        state = STATE_OUTSIDE;

        handleEndDocument();
      }
      else
        throw new SAXException("Unexpected end element '"+qName+"'.");
    }
  }

  /**
   * Begin the scope of a prefix-URI Namespace mapping.
   */
  public void startPrefixMapping(String prefix, String uri)
    throws SAXException
  {
    syncLocatorWithSource();

    contentHandler.startPrefixMapping(prefix, uri);
  }

  /**
   * End the scope of a prefix-URI mapping.
   */
  public void endPrefixMapping(String prefix) throws SAXException
  {
    syncLocatorWithSource();

    contentHandler.endPrefixMapping(prefix);
  }

  /**
   * Receive notification of a processing instruction.
   */
  public void processingInstruction(String target, String data)
    throws SAXException
  {
    syncLocatorWithSource();

    if (state==STATE_OUTSIDE)
      contentHandler.processingInstruction(target, data);
  }

  /**
   * Receive notification of a skipped entity.
   */
  public void skippedEntity(String name) throws SAXException
  {
    syncLocatorWithSource();

    if (state==STATE_OUTSIDE)
      contentHandler.skippedEntity(name);
  }

  /**
   * Receive notification of the end of a document.
   */
  public void endDocument() throws SAXException
  {
    syncLocatorWithSource();

    if (state==STATE_OUTSIDE)
      contentHandler.endDocument();
  }

  /**
   * Report the start of DTD declarations, if any.
   */
  public void startDTD(String name, String publicId, String systemId)
    throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.startDTD(name, publicId, systemId);
  }

  /**
   * Report the end of DTD declarations.
   */
  public void endDTD() throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.endDTD();
  }

  /**
   * Report the beginning of an entity.
   */
  public void startEntity(String name) throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.startEntity(name);
  }

  /**
   * Report the end of an entity.
   */
  public void endEntity(String name) throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.endEntity(name);
  }

  /**
   * Report the start of a CDATA section.
   */
  public void startCDATA() throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.startCDATA();
  }

  /**
   * Report the end of a CDATA section.
   */
  public void endCDATA() throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.endCDATA();
  }

  /**
   * Report an XML comment anywhere in the document.
   */
  public void comment(char[] ch, int start, int len) throws SAXException
  {
    if (lexicalHandler!=null)
      lexicalHandler.comment(ch, start, len);
  }

  /**
   * Receives the notification, that the text stream ended.
   *
   * <p>Tokenizes the buffered text. At every position all lexeme definitions of the
   * automaton are tried and the longest match is taken; among matches of equal length
   * the definition with the lowest index wins. Matches with a symbol are emitted as
   * <code>lexeme</code> elements, matches without a symbol are skipped, and text which
   * is not matched by any definition is collected and emitted as <code>error</code>
   * element.</p>
   */
  public void handleEndDocument() throws SAXException
  {
    PatternProcessor processor = new PatternProcessor();
    text = buffer.toString().toCharArray();

    int position = 0;

    if (locatorImpl!=null)
    {
      locatorImpl.setSystemId(source);
      locatorImpl.setLineNumber(lineNumber);
      locatorImpl.setColumnNumber(columnNumber);
    }

    contentHandler.startPrefixMapping("", NS_OUTPUT);

    AttributesImpl atts = new AttributesImpl();
    if ((localizable) && (source!=null))
      atts.addAttribute("", "source", "source", "CDATA", source);

    contentHandler.startElement(NS_OUTPUT, OUTPUT, OUTPUT, atts);

    StringBuffer unrecognized = new StringBuffer();
    while (position<text.length)
    {
      String tokensymbol = null;
      String tokentext = null;
      String[] tokengroups = null;

      // Counting down with ">=" lets the lowest index win among equally long matches
      for (int lexemeindex = automaton.getLexemeCount()-1; lexemeindex>=0; lexemeindex--)
      {
        processor.setPatternAutomaton(automaton.getLexemeDefinition(lexemeindex));

        if ((processor.match(text, position)) &&
            ((tokentext==null) || (processor.getGroup().length()>=tokentext.length())))
        {
          tokensymbol = automaton.getLexemeSymbol(lexemeindex);
          tokentext = processor.getGroup();

          // The processor is reused for the next definition, so the groups of the
          // winning match have to be saved now
          if (grouping)
            tokengroups = captureGroups(processor);
        }
      }

      if ((tokentext!=null) && (tokentext.length()==0) && (log!=null))
        log.warn("Lexical processor recognized empty lexeme '"+tokensymbol+"'");

      if ((tokentext!=null) && (tokentext.length()>0))
      {
        // Report the pending unrecognized text before the lexeme which follows it
        if (unrecognized.length()>0)
        {
          emitError(unrecognized.toString(), position);

          unrecognized = new StringBuffer();
        }

        if (tokensymbol!=null)
        {
          if (log!=null)
            log.debug("Recognize token "+tokensymbol+" with "+Decoder.toString(tokentext));

          syncLocatorWithSource();

          atts = new AttributesImpl();

          atts.addAttribute("", "symbol", "symbol", "CDATA", tokensymbol);
          atts.addAttribute("", "text", "text", "CDATA", tokentext);
          addLocation(atts);

          contentHandler.startElement(NS_OUTPUT, LEXEME, LEXEME, atts);

          if (tokengroups!=null)
            for (int i = 0; i<tokengroups.length; i++)
            {
              AttributesImpl groupatts = new AttributesImpl();
              groupatts.addAttribute("", "text", "text", "CDATA", tokengroups[i]);
              contentHandler.startElement(NS_OUTPUT, GROUP, GROUP, groupatts);
              contentHandler.endElement(NS_OUTPUT, GROUP, GROUP);
            }

          contentHandler.endElement(NS_OUTPUT, LEXEME, LEXEME);
        }
        else if (log!=null)
          log.debug("Ignore lexeme with "+Decoder.toString(tokentext));

        syncLocatorWithText();

        increasePosition(position, tokentext.length());
        position += tokentext.length();
      }
      else
      {
        syncLocatorWithText();

        // No (non-empty) match: collect the character and retry at the next position
        unrecognized.append(text[position]);
        position++;
      }
    }

    if (unrecognized.length()>0)
      emitError(unrecognized.toString(), position);

    syncLocatorWithSource();

    contentHandler.endElement(NS_OUTPUT, OUTPUT, OUTPUT);
    contentHandler.endPrefixMapping("");
  }

  /**
   * Copies the current position of the source locator into the locator, which
   * was handed to the content handler. Does nothing, if no locator was set.
   */
  private void syncLocatorWithSource()
  {
    if (locatorImpl!=null)
    {
      locatorImpl.setLineNumber(locator.getLineNumber());
      locatorImpl.setColumnNumber(locator.getColumnNumber());
    }
  }

  /**
   * Copies the current position within the buffered text into the locator, which
   * was handed to the content handler. Does nothing, if no locator was set.
   */
  private void syncLocatorWithText()
  {
    if (locatorImpl!=null)
    {
      locatorImpl.setColumnNumber(columnNumber);
      locatorImpl.setLineNumber(lineNumber);
    }
  }

  /**
   * Adds the current line and column as attributes, if the output should be localizable.
   *
   * @param atts Attributes, which should receive the location.
   */
  private void addLocation(AttributesImpl atts)
  {
    if (localizable)
    {
      atts.addAttribute("", "line", "line", "CDATA", String.valueOf(lineNumber));
      atts.addAttribute("", "column", "column", "CDATA", String.valueOf(columnNumber));
    }
  }

  /**
   * Saves the groups of the last match, without the group with the index 0.
   *
   * @param processor Processor, which performed the match.
   *
   * @return Texts of the groups 1 to count-1.
   */
  private String[] captureGroups(PatternProcessor processor)
  {
    int count = processor.getGroupCount();
    String[] groups = new String[Math.max(count-1, 0)];

    for (int i = 1; i<count; i++)
      groups[i-1] = processor.getGroup(i);

    return groups;
  }

  /**
   * Emits an error element for text, which was not recognized, and moves the
   * line and column counters behind this text.
   *
   * @param unrecognized Text, which was not recognized.
   * @param position Position within the buffered text directly behind the unrecognized text.
   */
  private void emitError(String unrecognized, int position) throws SAXException
  {
    if (log!=null)
      log.debug("Text was not recognized "+Decoder.toString(unrecognized));

    AttributesImpl atts = new AttributesImpl();
    atts.addAttribute("", "text", "text", "CDATA", unrecognized);
    addLocation(atts);

    contentHandler.startElement(NS_OUTPUT, ERROR, ERROR, atts);
    contentHandler.endElement(NS_OUTPUT, ERROR, ERROR);

    increasePosition(position-unrecognized.length(), unrecognized.length());
  }

  /**
   * Moves the line and column counters over a part of the buffered text.
   * A line feed, or a carriage return which is not followed by a line feed,
   * starts a new line; every other character increases the column.
   *
   * @param position Index of the first character within the buffered text.
   * @param length Count of characters.
   */
  private void increasePosition(int position, int length)
  {
    for (int i = position; i<(position+length); i++)
    {
      if (text[i]=='\n')
      {
        columnNumber = 1;
        lineNumber++;
      }
      else if ((text[i]=='\r') && ((i==(text.length-1)) || (text[i+1]!='\n')))
      {
        columnNumber = 1;
        lineNumber++;
      }
      else
        columnNumber++;
    }
  }
}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.