Example usage for the opennlp.tools.doccat.BagOfWordsFeatureGenerator constructor BagOfWordsFeatureGenerator()

List of usage examples for the opennlp.tools.doccat.BagOfWordsFeatureGenerator constructor BagOfWordsFeatureGenerator()

Introduction

On this page you can find example usages of the BagOfWordsFeatureGenerator() constructor of the class opennlp.tools.doccat.BagOfWordsFeatureGenerator.

Prototype

public BagOfWordsFeatureGenerator() 

Source Link

Usage

From source file:com.tamingtext.classifier.maxent.TrainMaxent.java

/**
 * Trains a document categorizer from the files found under {@code source} and
 * serializes the resulting model to {@code destination}.
 *
 * <p>The input files are wrapped in a {@code CategoryDataStream} tokenized with
 * {@code SimpleTokenizer.INSTANCE}; training uses a name-finder feature generator
 * and a bag-of-words feature generator.
 *
 * @param source      path handed to {@code FileUtil.buildFileList} to collect the training files
 * @param destination path of the file the trained model is written to
 * @throws IOException if the model file cannot be opened or written, or if one of
 *                     the called helpers reports an I/O failure
 */
public void train(String source, String destination) throws IOException {
    //<start id="maxent.examples.train.setup"/>
    File[] inputFiles = FileUtil.buildFileList(new File(source));
    File modelFile = new File(destination);

    Tokenizer tokenizer = SimpleTokenizer.INSTANCE; //<co id="tm.tok"/>
    CategoryDataStream ds = new CategoryDataStream(inputFiles, tokenizer);

    // Training parameters passed straight through to DocumentCategorizerME.train.
    int cutoff = 5;
    int iterations = 100;
    NameFinderFeatureGenerator nffg //<co id="tm.fg"/>
            = new NameFinderFeatureGenerator();
    BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator();

    DoccatModel model = DocumentCategorizerME.train("en", ds, cutoff, iterations, nffg, bowfg); //<co id="tm.train"/>

    // Close the stream explicitly, even when serialization fails, so the file
    // handle is not leaked. try/finally keeps the snippet usable on pre-Java-7 code bases.
    FileOutputStream modelOut = new FileOutputStream(modelFile);
    try {
        model.serialize(modelOut);
    } finally {
        modelOut.close();
    }

    /*<calloutlist>
    <callout arearefs="tm.tok">Create data stream</callout>
    <callout arearefs="tm.fg">Set up features generators</callout>
    <callout arearefs="tm.train">Train categorizer</callout>
    </calloutlist>*/
    //<end id="maxent.examples.train.setup"/>
}

From source file:com.tamingtext.classifier.maxent.TestMaxent.java

/**
 * Loads a serialized {@code DoccatModel} from {@code modelFile}, builds a
 * {@code DocumentCategorizerME} on top of it, and hands the input files to
 * {@code runTest} together with a {@code ResultAnalyzer} that knows every
 * category reported by the categorizer.
 *
 * @param inputFiles files forwarded to {@code runTest}
 * @param modelFile  file the serialized model is read from
 * @throws FileNotFoundException if {@code modelFile} cannot be opened for reading
 * @throws IOException           if reading the model or a called helper reports an I/O failure
 */
private static void execute(File[] inputFiles, File modelFile) throws IOException, FileNotFoundException {
    //<start id="maxent.examples.test.setup"/>
    NameFinderFeatureGenerator nffg //<co id="tmx.feature"/>
            = new NameFinderFeatureGenerator();
    BagOfWordsFeatureGenerator bowfg = new BagOfWordsFeatureGenerator();

    InputStream modelStream = //<co id="tmx.modelreader"/>
            new FileInputStream(modelFile);
    DoccatModel model;
    try {
        model = new DoccatModel(modelStream);
    } finally {
        // The stream is only needed while the model is being read; close it
        // whether or not loading succeeded so the file handle is not leaked.
        modelStream.close();
    }
    DocumentCategorizer categorizer //<co id="tmx.categorizer"/>
            = new DocumentCategorizerME(model, nffg, bowfg);
    Tokenizer tokenizer = SimpleTokenizer.INSTANCE;

    // Collect every category name the categorizer exposes, in index order.
    int catCount = categorizer.getNumberOfCategories();
    Collection<String> categories = new ArrayList<String>(catCount);
    for (int i = 0; i < catCount; i++) {
        categories.add(categorizer.getCategory(i));
    }
    // NOTE(review): "unknown" is presumably the fallback label used by ResultAnalyzer - confirm.
    ResultAnalyzer resultAnalyzer = //<co id="tmx.results"/>
            new ResultAnalyzer(categories, "unknown");
    runTest(inputFiles, categorizer, tokenizer, resultAnalyzer); //<co id="tmx.run"/>
    /*<calloutlist>
    <callout arearefs="tmx.feature">Setup Feature Generators</callout>
    <callout arearefs="tmx.modelreader">Load Model</callout>
    <callout arearefs="tmx.categorizer">Create Categorizer</callout>
    <callout arearefs="tmx.results">Prepare Result Analyzer</callout>
    <callout arearefs="tmx.run">Execute Test</callout>
    </calloutlist>*/
    //<end id="maxent.examples.test.setup"/>
}