POSEventCollector.java :  » Natural-Language-Processing » OpenNLP » opennlp » tools » postag » Java Open Source

Java Open Source » Natural Language Processing » OpenNLP 
OpenNLP » opennlp » tools » postag » POSEventCollector.java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


package opennlp.tools.postag;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;

import opennlp.model.Event;
import opennlp.model.EventCollector;
import opennlp.tools.util.Pair;

/**
 * An event generator for the maxent POS Tagger.
 * <p>
 * Reads lines of whitespace-separated <code>WORD_TAG</code> tokens from a
 * {@link Reader} and turns every token into one {@link Event} whose outcome
 * is the tag and whose context is produced by a {@link POSContextGenerator}.
 *
 * @author      Gann Bierner
 * @version     $Revision: 1.2 $, $Date: 2009/01/24 01:32:19 $
 */
@Deprecated
public class POSEventCollector implements EventCollector {

  /** Line-oriented view of the training data handed to the constructor. */
  private final BufferedReader br;

  /** Produces the context (feature strings) for each token position. */
  private final POSContextGenerator cg;

  /**
   * Initializes the current instance.
   *
   * @param data the source of annotated text; it is wrapped in a
   *             {@link BufferedReader} and consumed line by line by
   *             {@link #getEvents(boolean)}. This class never closes it.
   * @param gen  the context generator asked for the context of every token
   */
  public POSEventCollector(Reader data, POSContextGenerator gen) {
    br = new BufferedReader(data);
    cg = gen;
  }

  /**
   * Splits a single <code>WORD_TAG</code> token at its LAST underscore, so
   * that words which themselves contain underscores keep them.
   *
   * @param s one whitespace-free token of the annotated input
   * @return a pair of (word, tag); if the token contains no underscore a
   *         warning is printed to standard output and the pair is
   *         (s, "UNKNOWN")
   */
  private static Pair<String, String> split(String s) {
    int split = s.lastIndexOf("_");
    if (split == -1) {
      System.out.println("There is a problem in your training data: "
          + s
          + " does not conform to the format WORD_TAG.");
      return new Pair<String, String>(s, "UNKNOWN");
    }

    String word = s.substring(0, split);
    String tag = s.substring(split + 1);
    return new Pair<String, String>(word, tag);
  }

  /**
   * Converts one line of annotated text into parallel lists of words and tags.
   * The line is tokenized with the default {@link StringTokenizer} delimiters
   * and every token is split at its last underscore.
   *
   * @param s a line of whitespace-separated <code>WORD_TAG</code> tokens
   * @return a pair whose first element is the list of words and whose second
   *         element is the list of tags, index-aligned; both lists are empty
   *         when the line contains no tokens
   */
  public static Pair<List<String>, List<String>> convertAnnotatedString(String s) {
    List<String> tokens = new ArrayList<String>();
    List<String> outcomes = new ArrayList<String>();
    StringTokenizer st = new StringTokenizer(s);
    while (st.hasMoreTokens()) {
      Pair<String, String> p = split(st.nextToken());
      tokens.add(p.a);
      outcomes.add(p.b);
    }
    return new Pair<List<String>, List<String>>(tokens, outcomes);
  }

  /**
   * Collects the events from the remaining input; equivalent to
   * <code>getEvents(false)</code>.
   *
   * @return one event per token read from the input
   */
  public Event[] getEvents() {
    return getEvents(false);
  }

  /**
   * Builds up the list of features using the Reader as input.  For now, this
   * should only be used to create training data.
   * <p>
   * Every remaining line of the reader is consumed. For the token at
   * position <code>i</code> of a line, the context generator receives all
   * tokens of that line together with the tags of the tokens before
   * <code>i</code>.
   * <p>
   * If reading fails with an {@link IOException}, the stack trace is printed
   * and the events collected up to that point are returned.
   *
   * @param evalMode currently ignored by this implementation
   * @return one event per token, in input order; never <code>null</code>
   */
  public Event[] getEvents(boolean evalMode) {
    List<Event> elist = new ArrayList<Event>();
    try {
      String s = br.readLine();

      while (s != null) {
        Pair<List<String>, List<String>> p = convertAnnotatedString(s);
        List<String> outcomes = p.b;

        // The token array is identical for every position of the sentence,
        // so build it once instead of re-copying the list on each iteration.
        String[] tokens = p.a.toArray(new String[p.a.size()]);

        // Tags of the tokens already processed in this sentence.
        List<String> tags = new ArrayList<String>(tokens.length);

        for (int i = 0; i < tokens.length; i++) {
          // The tag history grows with every token, so it needs a fresh snapshot.
          String[] priorTags = tags.toArray(new String[tags.size()]);
          String[] context = cg.getContext(i, tokens, priorTags, null);

          String outcome = outcomes.get(i);
          elist.add(new Event(outcome, context));
          tags.add(outcome);
        }
        s = br.readLine();
      }
    }
    catch (IOException e) {
      // Best effort: report the failure and return what was read so far.
      e.printStackTrace();
    }

    return elist.toArray(new Event[elist.size()]);
  }
}
java2s.com  | Contact Us | Privacy Policy
Copyright 2009 - 12 Demo Source and Support. All rights reserved.
All other trademarks are property of their respective owners.