Example usage for org.apache.mahout.classifier ResultAnalyzer addInstance

List of usage examples for org.apache.mahout.classifier ResultAnalyzer addInstance

Introduction

On this page you can find example usage for org.apache.mahout.classifier ResultAnalyzer addInstance.

Prototype

public boolean addInstance(String correctLabel, ClassifierResult classifiedResult) 

Source Link

Usage

From source file:com.memonews.mahout.sentiment.SentimentModelTester.java

License:Apache License

/**
 * Classifies every file found under {@code inputFile} with the serialized
 * logistic-regression model stored in {@code modelFile}, then prints the
 * accumulated ResultAnalyzer summary (confusion matrix etc.) to {@code output}.
 *
 * <p>Each immediate sub-directory of the input directory is treated as one
 * category label; the files inside it are the test documents for that label.
 *
 * @param output destination for the analyzer summary
 * @throws IOException if the model, the input directory, or a test file cannot be read
 */
public void run(final PrintWriter output) throws IOException {

    final File base = new File(inputFile);

    // Load the best model. Close the stream explicitly — the original leaked
    // the FileInputStream handed to readBinary().
    final OnlineLogisticRegression classifier;
    final FileInputStream modelStream = new FileInputStream(modelFile);
    try {
        classifier = ModelSerializer.readBinary(modelStream, OnlineLogisticRegression.class);
    } finally {
        modelStream.close();
    }

    final Dictionary newsGroups = new Dictionary();
    final Multiset<String> overallCounts = HashMultiset.create();

    // listFiles() returns null for a non-directory or unreadable path;
    // fail with a clear message instead of an NPE.
    final File[] groups = base.listFiles();
    if (groups == null) {
        throw new IOException("Not a readable directory: " + base);
    }

    final List<File> files = Lists.newArrayList();
    for (final File newsgroup : groups) {
        if (newsgroup.isDirectory()) {
            newsGroups.intern(newsgroup.getName());
            files.addAll(Arrays.asList(newsgroup.listFiles()));
        }
    }
    System.out.printf("%d test files\n", files.size());
    final ResultAnalyzer ra = new ResultAnalyzer(newsGroups.values(), "DEFAULT");
    for (final File file : files) {
        // The parent directory name is the true label of this document.
        final String ng = file.getParentFile().getName();
        final int actual = newsGroups.intern(ng);
        final SentimentModelHelper helper = new SentimentModelHelper();
        // Per the original note: the no-leak encoding type ensures this is a normal vector.
        final Vector input = helper.encodeFeatureVector(file, overallCounts);
        final Vector result = classifier.classifyFull(input);
        final int cat = result.maxValueIndex();
        final double score = result.maxValue();
        final double ll = classifier.logLikelihood(actual, input);
        final ClassifierResult cr = new ClassifierResult(newsGroups.values().get(cat), score, ll);
        ra.addInstance(newsGroups.values().get(actual), cr);
    }
    output.printf("%s\n\n", ra.toString());
}

From source file:com.missionsky.scp.dataanalysis.mahout.TestNaiveBayesDriver.java

License:Apache License

/**
 * Finds the best-scoring category index in each classified score vector and
 * feeds the (correct label, predicted label) pair into the analyzer.
 *
 * @param labelMap    maps a vector index to its category label string
 * @param dirIterable (correct label, score vector) pairs produced by the classifier job
 * @param analyzer    accumulates the confusion matrix
 */
private static void analyzeResults(Map<Integer, String> labelMap,
        SequenceFileDirIterable<Text, VectorWritable> dirIterable, ResultAnalyzer analyzer) {
    for (Pair<Text, VectorWritable> pair : dirIterable) {
        int bestIdx = Integer.MIN_VALUE;
        // NEGATIVE_INFINITY is the correct arg-max sentinel for doubles;
        // the original used Long.MIN_VALUE, which silently rejects any
        // score below roughly -9.2e18.
        double bestScore = Double.NEGATIVE_INFINITY;
        for (Vector.Element element : pair.getSecond().get().all()) {
            if (element.get() > bestScore) {
                bestScore = element.get();
                bestIdx = element.index();
            }
        }
        if (bestIdx != Integer.MIN_VALUE) {
            ClassifierResult classifierResult = new ClassifierResult(labelMap.get(bestIdx), bestScore);
            analyzer.addInstance(pair.getFirst().toString(), classifierResult);
        }
    }
}

From source file:com.tamingtext.classifier.maxent.TestMaxent.java

License:Apache License

/**
 * Runs the categorizer over tab-separated (label TAB text) test files and
 * prints the accumulated analyzer summary to stderr.
 *
 * @param inputFiles     test files, one "label\ttext" record per line
 * @param categorizer    the trained maxent document categorizer
 * @param tokenizer      tokenizer applied to each document's text
 * @param resultAnalyzer accumulates (correct, predicted) pairs
 * @throws FileNotFoundException if a test file is missing
 * @throws IOException           on any other read failure
 */
private static void runTest(File[] inputFiles, DocumentCategorizer categorizer, Tokenizer tokenizer,
        ResultAnalyzer resultAnalyzer) throws FileNotFoundException, IOException {
    String line;
    //<start id="maxent.examples.test.execute"/>
    for (File ff : inputFiles) {
        BufferedReader in = new BufferedReader(new FileReader(ff));
        try {
            while ((line = in.readLine()) != null) {
                String[] parts = line.split("\t");
                // Skip malformed lines that are not exactly "label<TAB>text".
                if (parts.length != 2)
                    continue;

                String docText = parts[1]; //<co id="tmt.preprocess"/>
                String[] tokens = tokenizer.tokenize(docText);

                double[] probs = categorizer.categorize(tokens); //<co id="tmt.categorize"/>
                String label = categorizer.getBestCategory(probs);
                int bestIndex = categorizer.getIndex(label);
                double score = probs[bestIndex];

                ClassifierResult result //<co id="tmt.collect"/>
                        = new ClassifierResult(label, score);
                resultAnalyzer.addInstance(parts[0], result);
            }
        } finally {
            // Close in a finally block so a read/categorization failure
            // cannot leak the reader (the original closed only on success).
            in.close();
        }
    }

    System.err.println(resultAnalyzer.toString()); //<co id="tmt.summarize"/>
    /*<calloutlist>
     * <callout arearefs="tmt.preprocess">Preprocess text</callout>
     * <callout arearefs="tmt.categorize">Categorize</callout>
     * <callout arearefs="tmt.collect">Analyze Results</callout>
     * <callout arearefs="tmt.summarize">Present Results</callout>
     * </calloutlist>*/
    //<end id="maxent.examples.test.execute"/>
}

From source file:com.tamingtext.classifier.mlt.TestMoreLikeThis.java

License:Apache License

/**
 * Command-line entry point: classifies tab-separated test documents with a
 * MoreLikeThis categorizer backed by a Lucene index and prints a
 * ResultAnalyzer summary.
 *
 * <p>Required options: --input test directory, --model index directory,
 * --categoryField, --contentField. Optional: --maxResults (default 10),
 * --gramSize (default 1), --classifierType (knn|tfidf).
 *
 * @param args command-line arguments, parsed with commons-cli2
 * @throws Exception on any I/O or index failure
 */
public static void main(String[] args) throws Exception {
    DefaultOptionBuilder obuilder = new DefaultOptionBuilder();
    ArgumentBuilder abuilder = new ArgumentBuilder();
    GroupBuilder gbuilder = new GroupBuilder();

    Option helpOpt = DefaultOptionCreator.helpOption();

    Option inputDirOpt = obuilder.withLongName("input").withRequired(true)
            .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create())
            .withDescription("The input directory").withShortName("i").create();

    Option modelOpt = obuilder.withLongName("model").withRequired(true)
            .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create())
            .withDescription("The directory containing the index model").withShortName("m").create();

    Option categoryFieldOpt = obuilder.withLongName("categoryField").withRequired(true)
            .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create())
            .withDescription("Name of the field containing category information").withShortName("catf")
            .create();

    Option contentFieldOpt = obuilder.withLongName("contentField").withRequired(true)
            .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create())
            .withDescription("Name of the field containing content information").withShortName("contf")
            .create();

    Option maxResultsOpt = obuilder.withLongName("maxResults").withRequired(false)
            .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("Number of results to retrieve, default: 10 ").withShortName("r").create();

    Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(false)
            .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create())
            .withDescription("Size of the n-gram. Default Value: 1 ").withShortName("ng").create();

    Option typeOpt = obuilder.withLongName("classifierType").withRequired(false)
            .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create())
            .withDescription("Type of classifier: knn|tfidf. Default: bayes").withShortName("type").create();

    Group group = gbuilder.withName("Options").withOption(gramSizeOpt).withOption(helpOpt)
            .withOption(inputDirOpt).withOption(modelOpt).withOption(typeOpt).withOption(contentFieldOpt)
            .withOption(categoryFieldOpt).withOption(maxResultsOpt).create();

    try {
        Parser parser = new Parser();

        parser.setGroup(group);
        parser.setHelpOption(helpOpt);
        CommandLine cmdLine = parser.parse(args);
        if (cmdLine.hasOption(helpOpt)) {
            CommandLineUtil.printHelp(group);
            return;
        }

        String classifierType = (String) cmdLine.getValue(typeOpt);

        int gramSize = 1;
        if (cmdLine.hasOption(gramSizeOpt)) {
            gramSize = Integer.parseInt((String) cmdLine.getValue(gramSizeOpt));
        }

        int maxResults = 10;
        if (cmdLine.hasOption(maxResultsOpt)) {
            maxResults = Integer.parseInt((String) cmdLine.getValue(maxResultsOpt));
        }

        String inputPath = (String) cmdLine.getValue(inputDirOpt);
        String modelPath = (String) cmdLine.getValue(modelOpt);
        String categoryField = (String) cmdLine.getValue(categoryFieldOpt);
        String contentField = (String) cmdLine.getValue(contentFieldOpt);

        MatchMode mode;

        if ("knn".equalsIgnoreCase(classifierType)) {
            mode = MatchMode.KNN;
        } else if ("tfidf".equalsIgnoreCase(classifierType)) {
            mode = MatchMode.TFIDF;
        } else {
            // Fixed typo in the error message ("Unkown").
            throw new IllegalArgumentException("Unknown classifierType: " + classifierType);
        }

        Directory directory = FSDirectory.open(new File(modelPath));
        IndexReader indexReader = IndexReader.open(directory);
        Analyzer analyzer //<co id="mlt.analyzersetup"/>
                = new EnglishAnalyzer(Version.LUCENE_36);

        MoreLikeThisCategorizer categorizer = new MoreLikeThisCategorizer(indexReader, categoryField);
        categorizer.setAnalyzer(analyzer);
        categorizer.setMatchMode(mode);
        categorizer.setFieldNames(new String[] { contentField });
        categorizer.setMaxResults(maxResults);
        categorizer.setNgramSize(gramSize);

        File f = new File(inputPath);
        if (!f.isDirectory()) {
            // Fixed typo in the error message ("does not exit").
            throw new IllegalArgumentException(f + " is not a directory or does not exist");
        }

        File[] inputFiles = FileUtil.buildFileList(f);

        String line = null;
        //<start id="lucene.examples.mlt.test"/>
        // Fallback result used when the categorizer returns no hits.
        final ClassifierResult UNKNOWN = new ClassifierResult("unknown", 1.0);

        ResultAnalyzer resultAnalyzer = //<co id="co.mlt.ra"/>
                new ResultAnalyzer(categorizer.getCategories(), UNKNOWN.getLabel());

        for (File ff : inputFiles) { //<co id="co.mlt.read"/>
            BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(ff), "UTF-8"));
            try {
                while ((line = in.readLine()) != null) {
                    String[] parts = line.split("\t");
                    // Skip lines that are not exactly "label<TAB>text".
                    if (parts.length != 2) {
                        continue;
                    }

                    CategoryHits[] hits //<co id="co.mlt.cat"/>
                            = categorizer.categorize(new StringReader(parts[1]));
                    ClassifierResult result = hits.length > 0 ? hits[0] : UNKNOWN;
                    resultAnalyzer.addInstance(parts[0], result); //<co id="co.mlt.an"/>
                }
            } finally {
                // Close in a finally block so a failed read cannot leak the reader.
                in.close();
            }
        }

        System.out.println(resultAnalyzer.toString());//<co id="co.mlt.print"/>
        /*
        <calloutlist>
          <callout arearefs="co.mlt.ra">Create <classname>ResultAnalyzer</classname></callout>
          <callout arearefs="co.mlt.read">Read Test data</callout>
          <callout arearefs="co.mlt.cat">Categorize</callout>
          <callout arearefs="co.mlt.an">Collect Results</callout>
          <callout arearefs="co.mlt.print">Display Results</callout>
        </calloutlist>
        */
        //<end id="lucene.examples.mlt.test"/>
    } catch (OptionException e) {
        log.error("Error while parsing options", e);
    }
}

From source file:com.wsc.myexample.decisionForest.MyTestForest.java

License:Apache License

/**
 * Classifies every non-empty line of {@code inPath} with the forest, optionally
 * writes the raw predictions to {@code outPath}, and records each
 * (actual label, predicted label) pair in the analyzer.
 *
 * @param inPath    path of the test data file
 * @param outPath   path for the predictions file, or null to skip writing predictions
 * @param converter parses a CSV line into an Instance
 * @param forest    the trained decision forest
 * @param dataset   dataset metadata (labels, attribute types)
 * @param rng       randomness source used by the forest during classification
 * @param analyzer  accumulates the confusion matrix
 * @throws IOException on any read/write failure
 */
private void testFile(String inPath, String outPath, DataConverter converter, MyDecisionForest forest,
        Dataset dataset, /*List<double[]> results,*/ Random rng, ResultAnalyzer analyzer) throws IOException {
    // create the predictions file only when an output path was requested
    DataOutputStream ofile = null;

    if (outPath != null) {
        ofile = new DataOutputStream(new FileOutputStream(outPath));
    }

    DataInputStream input = new DataInputStream(new FileInputStream(inPath));
    try {
        Scanner scanner = new Scanner(input);

        while (scanner.hasNextLine()) {
            String line = scanner.nextLine();
            if (line.isEmpty()) {
                continue; // skip empty lines
            }

            Instance instance = converter.convert(line);
            if (instance == null)
                continue; // skip unparseable lines

            double prediction = forest.classify(dataset, rng, instance);

            if (ofile != null) {
                ofile.writeChars(Double.toString(prediction)); // write the prediction
                ofile.writeChar('\n');
            }

            analyzer.addInstance(dataset.getLabelString(dataset.getLabel(instance)),
                    new ClassifierResult(dataset.getLabelString(prediction), 1.0));
        }

        scanner.close();
    } finally {
        Closeables.closeQuietly(input);
        // BUG FIX: the original called ofile.close() unconditionally, which
        // threw a NullPointerException from the finally block whenever
        // outPath was null (predictions not requested).
        if (ofile != null) {
            ofile.close();
        }
    }
}

From source file:guipart.view.GUIOverviewController.java

/**
 * JavaFX handler: classifies the test data selected in the UI with a stored
 * random forest and shows the ResultAnalyzer (or regression) summary in the
 * analysis text area.
 *
 * @param event the button event (unused)
 * @throws IOException if the forest, dataset info, or test data cannot be read
 */
@FXML
void handleClassifyRF(ActionEvent event) throws IOException {

    String outputFile = "data/out";

    Path dataPath = new Path(textFieldCSVRF.getText()); // test data path
    Path datasetPath = new Path(textFieldDatasetRF.getText()); //info file about data set
    Path modelPath = new Path(textFieldModelRF.getText()); // path where the forest is stored
    Path outputPath = new Path(outputFile); // path to predictions file, if null do not output the predictions

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileSystem outFS = FileSystem.get(conf);

    System.out.println("Loading the forest");
    DecisionForest forest = DecisionForest.load(conf, modelPath);

    if (forest == null) {
        System.err.println("No decision forest found!");
        // BUG FIX: the original fell through with a null forest, causing an
        // NPE during classification below; bail out instead.
        return;
    }

    // load the dataset
    Dataset dataset = Dataset.load(conf, datasetPath);
    DataConverter converter = new DataConverter(dataset);

    System.out.println("Sequential classification");
    long time = System.currentTimeMillis();

    Random rng = RandomUtils.getRandom();

    List<double[]> resList = Lists.newArrayList();
    if (fs.getFileStatus(dataPath).isDir()) {
        //the input is a directory of files
        Utils.rfTestDirectory(outputPath, converter, forest, dataset, resList, rng, fs, dataPath, outFS,
                guiPart);
    } else {
        // the input is one single file
        Utils.rfTestFile(dataPath, outputPath, converter, forest, dataset, resList, rng, outFS, fs, guiPart);
    }

    time = System.currentTimeMillis() - time;
    System.out.println("Classification time: " + DFUtils.elapsedTime(time));

    if (dataset.isNumerical(dataset.getLabelId())) {
        // Numerical label: report regression statistics.
        RegressionResultAnalyzer regressionAnalyzer = new RegressionResultAnalyzer();
        double[][] results = new double[resList.size()][2];
        regressionAnalyzer.setInstances(resList.toArray(results));
        System.out.println(regressionAnalyzer.toString());

    } else {
        // Categorical label: build the confusion matrix from (actual, predicted) pairs.
        ResultAnalyzer analyzer = new ResultAnalyzer(Arrays.asList(dataset.labels()), "unknown");
        for (double[] r : resList) {
            analyzer.addInstance(dataset.getLabelString(r[0]),
                    new ClassifierResult(dataset.getLabelString(r[1]), 1.0));
        }
        System.out.println(analyzer.toString());
        textAnalyze.setText(analyzer.toString());
    }

}

From source file:imageClassify.TestForest.java

License:Apache License

/**
 * Runs the classification as a map-reduce job and, when analysis was
 * requested, logs either a regression or a confusion-matrix summary of the
 * collected (actual, predicted) result pairs.
 *
 * @throws IllegalArgumentException when no output path was supplied
 */
private void mapreduce() throws ClassNotFoundException, IOException, InterruptedException {
    if (outputPath == null) {
        throw new IllegalArgumentException(
                "You must specify the ouputPath when using the mapreduce implementation");
    }

    Classifier classifier = new Classifier(modelPath, dataPath, datasetPath, outputPath, getConf());
    classifier.run();

    // Guard clauses replace the original nested ifs; nothing to report otherwise.
    if (!analyze) {
        return;
    }
    double[][] results = classifier.getResults();
    if (results == null) {
        return;
    }

    Dataset dataset = Dataset.load(getConf(), datasetPath);
    if (dataset.isNumerical(dataset.getLabelId())) {
        // Numerical label: regression statistics.
        RegressionResultAnalyzer regressionAnalyzer = new RegressionResultAnalyzer();
        regressionAnalyzer.setInstances(results);
        log.info("{}", regressionAnalyzer);
    } else {
        // Categorical label: confusion matrix over (actual, predicted) pairs.
        ResultAnalyzer analyzer = new ResultAnalyzer(Arrays.asList(dataset.labels()), "unknown");
        for (double[] pair : results) {
            analyzer.addInstance(dataset.getLabelString(pair[0]),
                    new ClassifierResult(dataset.getLabelString(pair[1]), 1.0));
        }
        log.info("{}", analyzer);
    }
}

From source file:imageClassify.TestForest.java

License:Apache License

/**
 * Classifies the test data sequentially (no map-reduce) with the stored
 * decision forest and, when analysis was requested, logs a regression or
 * confusion-matrix summary.
 *
 * @throws IOException if the forest, dataset, or test data cannot be read
 */
private void sequential() throws IOException {

    log.info("Loading the forest...");
    DecisionForest decisionForest = DecisionForest.load(getConf(), modelPath);
    if (decisionForest == null) {
        log.error("No Decision Forest found!");
        return;
    }

    // Dataset metadata drives both conversion and label lookup below.
    Dataset dataset = Dataset.load(getConf(), datasetPath);
    DataConverter converter = new DataConverter(dataset);

    log.info("Sequential classification...");
    long startMillis = System.currentTimeMillis();

    Random random = RandomUtils.getRandom();

    List<double[]> collected = Lists.newArrayList();
    boolean isDirectory = dataFS.getFileStatus(dataPath).isDir();
    if (isDirectory) {
        // A directory: classify every file beneath it.
        testDirectory(outputPath, converter, decisionForest, dataset, collected, random);
    } else {
        // A single test file.
        testFile(dataPath, outputPath, converter, decisionForest, dataset, collected, random);
    }

    long elapsed = System.currentTimeMillis() - startMillis;
    log.info("Classification Time: {}", DFUtils.elapsedTime(elapsed));

    if (!analyze) {
        return;
    }
    if (dataset.isNumerical(dataset.getLabelId())) {
        // Numerical label: regression statistics.
        RegressionResultAnalyzer regressionAnalyzer = new RegressionResultAnalyzer();
        double[][] asArray = new double[collected.size()][2];
        regressionAnalyzer.setInstances(collected.toArray(asArray));
        log.info("{}", regressionAnalyzer);
    } else {
        // Categorical label: confusion matrix over (actual, predicted) pairs.
        ResultAnalyzer analyzer = new ResultAnalyzer(Arrays.asList(dataset.labels()), "unknown");
        for (double[] pair : collected) {
            analyzer.addInstance(dataset.getLabelString(pair[0]),
                    new ClassifierResult(dataset.getLabelString(pair[1]), 1.0));
        }
        log.info("{}", analyzer);
    }
}

From source file:javaapplication3.runRandomForest.java

public static void main(String[] args) throws InterruptedException, IOException, ClassNotFoundException {

    String outputFile = "data/lule24";
    String inputFile = "data/DataFraud1MTest.csv";
    String modelFile = "data/forest.seq";
    String infoFile = "data/DataFraud1M.info";

    Path dataPath = new Path(inputFile); // test data path
    Path datasetPath = new Path(infoFile);
    Path modelPath = new Path(modelFile); // path where the forest is stored
    Path outputPath = new Path(outputFile); // path to predictions file, if null do not output the predictions

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    /*/*from ww w . java 2s  . c  o  m*/
    p = Runtime.getRuntime().exec("bash /home/ivan/hadoop-1.2.1/bin/start-all.sh");
    p.waitFor();*/

    if (outputPath == null) {
        throw new IllegalArgumentException(
                "You must specify the ouputPath when using the mapreduce implementation");
    }

    Classifier classifier = new Classifier(modelPath, dataPath, datasetPath, outputPath, conf);

    classifier.run();

    double[][] results = classifier.getResults();

    if (results != null) {

        Dataset dataset = Dataset.load(conf, datasetPath);
        Data data = DataLoader.loadData(dataset, fs, dataPath);

        Instance inst;

        for (int i = 0; i < data.size(); i++) {
            inst = data.get(i);

            //System.out.println("Prediction:"+inst.get(7)+" Real value:"+results[i][1]);
            System.out.println(inst.get(0) + " " + inst.get(1) + " " + inst.get(2) + " " + inst.get(3) + " "
                    + inst.get(4) + " " + inst.get(5) + " " + inst.get(6) + " " + inst.get(7) + " ");
        }

        ResultAnalyzer analyzer = new ResultAnalyzer(Arrays.asList(dataset.labels()), "unknown");

        for (double[] res : results) {
            analyzer.addInstance(dataset.getLabelString(res[0]),
                    new ClassifierResult(dataset.getLabelString(res[1]), 1.0));
            System.out.println("Prvi shit:" + res[0] + " Drugi Shit" + res[1]);
        }

        System.out.println(analyzer.toString());

    }

}

From source file:javaapplication3.RunRandomForestSeq.java

/**
 * Sequentially classifies a fixed test file with a stored decision forest
 * and prints either a regression summary or the ResultAnalyzer confusion
 * matrix built from (actual, predicted) label pairs.
 *
 * @param args unused
 * @throws IOException if the forest, dataset info, or test data cannot be read
 */
public static void main(String[] args) throws IOException {

    String outputFile = "data/out";
    String inputFile = "data/DataFraud1MTest.csv";
    String modelFile = "data/forest.seq";
    String infoFile = "data/DataFraud1M.info";

    Path dataPath = new Path(inputFile); // test data path
    Path datasetPath = new Path(infoFile);
    Path modelPath = new Path(modelFile); // path where the forest is stored
    Path outputPath = new Path(outputFile); // path to predictions file, if null do not output the predictions

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileSystem outFS = FileSystem.get(conf);

    System.out.println("Loading the forest");
    DecisionForest forest = DecisionForest.load(conf, modelPath);

    if (forest == null) {
        System.err.println("No decision forest found!");
        // BUG FIX: the original printed the error but kept going with a null
        // forest, guaranteeing an NPE during classification; abort instead.
        return;
    }

    // load the dataset
    Dataset dataset = Dataset.load(conf, datasetPath);
    DataConverter converter = new DataConverter(dataset);

    System.out.println("Sequential classification");
    long time = System.currentTimeMillis();

    Random rng = RandomUtils.getRandom();

    List<double[]> resList = Lists.newArrayList();
    if (fs.getFileStatus(dataPath).isDir()) {
        //the input is a directory of files
        testDirectory(outputPath, converter, forest, dataset, resList, rng, fs, dataPath, outFS);
    } else {
        // the input is one single file
        testFile(dataPath, outputPath, converter, forest, dataset, resList, rng, outFS, fs);
    }

    time = System.currentTimeMillis() - time;
    System.out.println("Classification time: " + DFUtils.elapsedTime(time));

    if (dataset.isNumerical(dataset.getLabelId())) {
        // Numerical label: report regression statistics.
        RegressionResultAnalyzer regressionAnalyzer = new RegressionResultAnalyzer();
        double[][] results = new double[resList.size()][2];
        regressionAnalyzer.setInstances(resList.toArray(results));
        System.out.println(regressionAnalyzer.toString());

    } else {
        // Categorical label: build the confusion matrix from (actual, predicted) pairs.
        ResultAnalyzer analyzer = new ResultAnalyzer(Arrays.asList(dataset.labels()), "unknown");
        for (double[] r : resList) {
            analyzer.addInstance(dataset.getLabelString(r[0]),
                    new ClassifierResult(dataset.getLabelString(r[1]), 1.0));
        }
        System.out.println(analyzer.toString());
    }

}