com.scaleunlimited.classify.BaseModel.java Source code

Java tutorial

Introduction

Here is the source code for com.scaleunlimited.classify.BaseModel.java

Source

/**
 * Copyright (c) 2009-2015 Scale Unlimited, Inc.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *     http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.scaleunlimited.classify;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Writable;

import com.scaleunlimited.classify.datum.DocDatum;
import com.scaleunlimited.classify.datum.TermsDatum;

/**
 * Classification model, which can be trained using a set of pre-labeled
 * T documents, and which can then be used to classify
 * a set of unlabeled T documents, outputting the
 * classification of each as a {@link DocDatum} document.
 */
@SuppressWarnings("serial")
public abstract class BaseModel<T> implements Writable, Serializable {

    public static final String NOT_YET_LABELED = "";
    public static final float NOT_YET_SCORED = Float.NaN;

    protected BaseModel() {
    }

    /**
     * @param datum input document terms (with label) to help train model
     */
    abstract public void addTrainingTerms(T datum);

    /**
     * Use all training documents added via {@link #addTrainingTerms(T)}
     * since the last call to {@link #train()} to train the classification model
     * (i.e., replacing out any model definition from the previous call).
     */
    abstract public void train();

    /**
     * @param datum (unlabeled) input document terms to be classified
     * @return classification of input document terms
     */
    abstract public DocDatum classify(T datum);

    /**
     * @param datum (unlabeled) input document terms to be classified
     * @return top <n> results from classification
     */
    abstract public DocDatum[] classifyNResults(T datum, int n);

    /**
     * Generate details about the model.
     * 
     * @return Text description of the model.
     */
    abstract public String getDetails();

    /**
     * Reset model for another training run.
     */
    public void reset() {
        // Base does nothing.
    }

    /**
     * @param out stream into which string list should be serialized
     * @param string list to be serialized
     * @throws IOException
     */
    protected static void writeStrings(DataOutput out, List<String> strings) throws IOException {
        out.writeInt(strings.size());
        for (String string : strings) {
            out.writeUTF(string);
        }
    }

    /**
     * @param in stream from which string list should be deserialized
     * @return deserialized string list
     * @throws IOException
     */
    protected static List<String> readStrings(DataInput in) throws IOException {
        int numStrings = in.readInt();
        List<String> result = new ArrayList<String>(numStrings);
        for (int i = 0; i < numStrings; i++) {
            result.add(in.readUTF());
        }

        return result;
    }

}