SentenceSplitterPR.java :  » Natural-Language-Processing » GATE » gate » lingpipe » Java Open Source

Java Open Source » Natural Language Processing » GATE 
GATE » gate » lingpipe » SentenceSplitterPR.java
package gate.lingpipe;

import gate.AnnotationSet;
import gate.FeatureMap;
import gate.ProcessingResource;
import gate.Resource;
import gate.creole.AbstractLanguageAnalyser;
import gate.creole.ExecutionException;
import gate.creole.ResourceInstantiationException;
import gate.util.InvalidOffsetException;

import java.util.Iterator;
import java.util.Set;

import com.aliasi.chunk.Chunk;
import com.aliasi.chunk.Chunking;
import com.aliasi.sentences.MedlineSentenceModel;
import com.aliasi.sentences.SentenceChunker;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.TokenizerFactory;

/**
 * Processing resource that splits a document into sentences.
 * <p>
 * The text of the document is handed to a LingPipe {@link SentenceChunker};
 * every chunk it returns is added to the output annotation set as an
 * annotation of type <code>Sentence</code>.
 *
 * @author Ekaterina Mihaylova
 */
public class SentenceSplitterPR extends AbstractLanguageAnalyser implements
    ProcessingResource {

  /**
   * Tokenizer factory handed to the sentence chunker.
   */
  static final TokenizerFactory TOKENIZER_FACTORY = IndoEuropeanTokenizerFactory.INSTANCE;

  /**
   * Sentence model handed to the sentence chunker.
   */
  static final SentenceModel SENTENCE_MODEL = new MedlineSentenceModel();

  /**
   * Sentence chunker built from {@link #TOKENIZER_FACTORY} and
   * {@link #SENTENCE_MODEL}; a single instance is used for every execution.
   */
  static final SentenceChunker SENTENCE_CHUNKER = new SentenceChunker(
      TOKENIZER_FACTORY, SENTENCE_MODEL);

  /**
   * Name of the annotation set that receives the Sentence annotations. When
   * it is <code>null</code> or blank, {@link #execute()} uses the set returned
   * by <code>document.getAnnotations()</code>.
   */
  private String outputASName;

  /**
   * Gets the name of the output annotation set where the Sentence
   * annotations are stored.
   *
   * @return the configured annotation set name; may be <code>null</code>
   */
  public String getOutputASName() {
    return outputASName;
  }

  /**
   * Sets the name of the output annotation set where the Sentence
   * annotations are stored.
   *
   * @param outputAS the annotation set name; <code>null</code> or a blank
   *          string makes {@link #execute()} use
   *          <code>document.getAnnotations()</code>
   */
  public void setOutputASName(String outputAS) {
    this.outputASName = outputAS;
  }

  /**
   * Initialises this resource by delegating to the superclass.
   *
   * @return whatever <code>super.init()</code> returns
   * @throws ResourceInstantiationException if the superclass initialisation
   *           fails
   */
  public Resource init() throws ResourceInstantiationException {
    return super.init();
  }

  /**
   * Reinitialises the processing resource by calling {@link #init()} again.
   *
   * @throws ResourceInstantiationException if {@link #init()} fails
   */
  public void reInit() throws ResourceInstantiationException {
    init();
  }

  /**
   * Runs the sentence splitter over the current document and adds one
   * <code>Sentence</code> annotation per chunk found by
   * {@link #SENTENCE_CHUNKER}. Progress is reported after every annotation,
   * and the process-finished event is fired on every normal return, including
   * the case where no sentence is found.
   *
   * @throws ExecutionException if no document has been set, or if a chunk
   *           offset is rejected by the annotation set
   */
  public void execute() throws ExecutionException {

    if (document == null) {
      throw new ExecutionException("The document can't be null");
    }

    AnnotationSet set;
    if (outputASName == null || outputASName.trim().length() == 0) {
      set = document.getAnnotations();
    } else {
      set = document.getAnnotations(outputASName);
    }

    fireProgressChanged(0);

    String text = document.getContent().toString();

    Chunking chunking = SENTENCE_CHUNKER.chunk(text.toCharArray(), 0,
        text.length());
    Set<Chunk> sentences = chunking.chunkSet();
    if (sentences.isEmpty()) {
      System.out.println("No sentence chunks found.");
      // Listeners were already told the run started; tell them it ended too.
      fireProcessFinished();
      return;
    }

    int total = sentences.size();
    int done = 0;
    for (Chunk sentence : sentences) {
      // A fresh map per annotation: sharing one instance would make a
      // feature set on any Sentence show up on all of them.
      FeatureMap features = gate.Factory.newFeatureMap();
      try {
        set.add(Long.valueOf(sentence.start()), Long.valueOf(sentence.end()),
            "Sentence", features);
      } catch (InvalidOffsetException e) {
        throw new ExecutionException(e);
      }
      done++;
      // Computed in long so that 100 * done cannot overflow int.
      fireProgressChanged((int) (100L * done / total));
    }

    fireProcessFinished();
  }

}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.