List of usage examples for org.apache.mahout.classifier ClassifierResult getLabel
public String getLabel()
From source file:com.grantingersoll.intell.index.BayesUpdateRequestProcessor.java
License:Apache License
/**
 * Classifies the document carried by {@code cmd} and, when a usable label was
 * produced, stores it in {@code outputField} before delegating to the
 * superclass implementation.
 *
 * <p>A label is considered usable when it is non-null and is not equal (by
 * value) to the {@code NULL} sentinel.
 *
 * @param cmd the add command whose Solr input document is classified
 * @throws IOException propagated from classification or from the superclass call
 */
@Override
public void processAdd(AddUpdateCommand cmd) throws IOException {
    SolrInputDocument doc = cmd.getSolrInputDocument();
    ClassifierResult result = classifyDocument(doc);

    // Compare by value: '!=' on Strings only tests reference identity and would
    // treat an equal-but-distinct sentinel string as a real category.
    String label = (result == null) ? null : result.getLabel();
    if (label != null && !label.equals(NULL)) {
        doc.addField(outputField, label);
    }

    super.processAdd(cmd);
}
From source file:com.tamingtext.classifier.bayes.BayesUpdateRequestProcessor.java
License:Apache License
public void classifyDocument(SolrInputDocument doc) throws IOException { try {//w w w .j a va 2s .c o m //<start id="mahout.bayes.classify"/> SolrInputField field = doc.getField(inputField); String[] tokens = tokenizeField(inputField, field); ClassifierResult result = ctx.classifyDocument(tokens, defaultCategory); if (result != null && result.getLabel() != NO_LABEL) { doc.addField(outputField, result.getLabel()); } //<end id="mahout.bayes.classify"/> } catch (InvalidDatastoreException e) { throw new IOException("Invalid Classifier Datastore", e); } }
From source file:com.tamingtext.classifier.bayes.ClassifyDocument.java
License:Apache License
public static void main(String[] args) { log.info("Command-line arguments: " + Arrays.toString(args)); DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option inputOpt = obuilder.withLongName("input").withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription("Input file").withShortName("i").create(); Option modelOpt = obuilder.withLongName("model").withRequired(true) .withArgument(abuilder.withName("model").withMinimum(1).withMaximum(1).create()) .withDescription("Model to use when classifying data").withShortName("m").create(); Option helpOpt = obuilder.withLongName("help").withDescription("Print out help").withShortName("h") .create();/*from www . j a v a 2 s .c o m*/ Group group = gbuilder.withName("Options").withOption(inputOpt).withOption(modelOpt).withOption(helpOpt) .create(); try { Parser parser = new Parser(); parser.setGroup(group); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } File inputFile = new File(cmdLine.getValue(inputOpt).toString()); if (!inputFile.isFile()) { throw new IllegalArgumentException(inputFile + " does not exist or is not a file"); } File modelDir = new File(cmdLine.getValue(modelOpt).toString()); if (!modelDir.isDirectory()) { throw new IllegalArgumentException(modelDir + " does not exist or is not a directory"); } BayesParameters p = new BayesParameters(); p.set("basePath", modelDir.getCanonicalPath()); Datastore ds = new InMemoryBayesDatastore(p); Algorithm a = new BayesAlgorithm(); ClassifierContext ctx = new ClassifierContext(a, ds); ctx.initialize(); //TODO: make the analyzer configurable StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36); TokenStream ts = analyzer.tokenStream(null, new InputStreamReader(new FileInputStream(inputFile), "UTF-8")); ArrayList<String> tokens = new 
ArrayList<String>(1000); while (ts.incrementToken()) { tokens.add(ts.getAttribute(CharTermAttribute.class).toString()); } String[] document = tokens.toArray(new String[tokens.size()]); ClassifierResult[] cr = ctx.classifyDocument(document, "unknown", 5); for (ClassifierResult r : cr) { System.err.println(r.getLabel() + "\t" + r.getScore()); } } catch (OptionException e) { log.error("Exception", e); CommandLineUtil.printHelp(group); } catch (IOException e) { log.error("IOException", e); } catch (InvalidDatastoreException e) { log.error("InvalidDataStoreException", e); } finally { } }
From source file:com.tamingtext.classifier.mlt.TestMoreLikeThis.java
License:Apache License
public static void main(String[] args) throws Exception { DefaultOptionBuilder obuilder = new DefaultOptionBuilder(); ArgumentBuilder abuilder = new ArgumentBuilder(); GroupBuilder gbuilder = new GroupBuilder(); Option helpOpt = DefaultOptionCreator.helpOption(); Option inputDirOpt = obuilder.withLongName("input").withRequired(true) .withArgument(abuilder.withName("input").withMinimum(1).withMaximum(1).create()) .withDescription("The input directory").withShortName("i").create(); Option modelOpt = obuilder.withLongName("model").withRequired(true) .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create()) .withDescription("The directory containing the index model").withShortName("m").create(); Option categoryFieldOpt = obuilder.withLongName("categoryField").withRequired(true) .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create()) .withDescription("Name of the field containing category information").withShortName("catf") .create();//from www.ja v a2 s . com Option contentFieldOpt = obuilder.withLongName("contentField").withRequired(true) .withArgument(abuilder.withName("index").withMinimum(1).withMaximum(1).create()) .withDescription("Name of the field containing content information").withShortName("contf") .create(); Option maxResultsOpt = obuilder.withLongName("maxResults").withRequired(false) .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create()) .withDescription("Number of results to retrive, default: 10 ").withShortName("r").create(); Option gramSizeOpt = obuilder.withLongName("gramSize").withRequired(false) .withArgument(abuilder.withName("gramSize").withMinimum(1).withMaximum(1).create()) .withDescription("Size of the n-gram. 
Default Value: 1 ").withShortName("ng").create(); Option typeOpt = obuilder.withLongName("classifierType").withRequired(false) .withArgument(abuilder.withName("classifierType").withMinimum(1).withMaximum(1).create()) .withDescription("Type of classifier: knn|tfidf. Default: bayes").withShortName("type").create(); Group group = gbuilder.withName("Options").withOption(gramSizeOpt).withOption(helpOpt) .withOption(inputDirOpt).withOption(modelOpt).withOption(typeOpt).withOption(contentFieldOpt) .withOption(categoryFieldOpt).withOption(maxResultsOpt).create(); try { Parser parser = new Parser(); parser.setGroup(group); parser.setHelpOption(helpOpt); CommandLine cmdLine = parser.parse(args); if (cmdLine.hasOption(helpOpt)) { CommandLineUtil.printHelp(group); return; } String classifierType = (String) cmdLine.getValue(typeOpt); int gramSize = 1; if (cmdLine.hasOption(gramSizeOpt)) { gramSize = Integer.parseInt((String) cmdLine.getValue(gramSizeOpt)); } int maxResults = 10; if (cmdLine.hasOption(maxResultsOpt)) { maxResults = Integer.parseInt((String) cmdLine.getValue(maxResultsOpt)); } String inputPath = (String) cmdLine.getValue(inputDirOpt); String modelPath = (String) cmdLine.getValue(modelOpt); String categoryField = (String) cmdLine.getValue(categoryFieldOpt); String contentField = (String) cmdLine.getValue(contentFieldOpt); MatchMode mode; if ("knn".equalsIgnoreCase(classifierType)) { mode = MatchMode.KNN; } else if ("tfidf".equalsIgnoreCase(classifierType)) { mode = MatchMode.TFIDF; } else { throw new IllegalArgumentException("Unkown classifierType: " + classifierType); } Directory directory = FSDirectory.open(new File(modelPath)); IndexReader indexReader = IndexReader.open(directory); Analyzer analyzer //<co id="mlt.analyzersetup"/> = new EnglishAnalyzer(Version.LUCENE_36); MoreLikeThisCategorizer categorizer = new MoreLikeThisCategorizer(indexReader, categoryField); categorizer.setAnalyzer(analyzer); categorizer.setMatchMode(mode); categorizer.setFieldNames(new 
String[] { contentField }); categorizer.setMaxResults(maxResults); categorizer.setNgramSize(gramSize); File f = new File(inputPath); if (!f.isDirectory()) { throw new IllegalArgumentException(f + " is not a directory or does not exit"); } File[] inputFiles = FileUtil.buildFileList(f); String line = null; //<start id="lucene.examples.mlt.test"/> final ClassifierResult UNKNOWN = new ClassifierResult("unknown", 1.0); ResultAnalyzer resultAnalyzer = //<co id="co.mlt.ra"/> new ResultAnalyzer(categorizer.getCategories(), UNKNOWN.getLabel()); for (File ff : inputFiles) { //<co id="co.mlt.read"/> BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(ff), "UTF-8")); while ((line = in.readLine()) != null) { String[] parts = line.split("\t"); if (parts.length != 2) { continue; } CategoryHits[] hits //<co id="co.mlt.cat"/> = categorizer.categorize(new StringReader(parts[1])); ClassifierResult result = hits.length > 0 ? hits[0] : UNKNOWN; resultAnalyzer.addInstance(parts[0], result); //<co id="co.mlt.an"/> } in.close(); } System.out.println(resultAnalyzer.toString());//<co id="co.mlt.print"/> /* <calloutlist> <callout arearefs="co.mlt.ra">Create <classname>ResultAnalyzer</classname></callout> <callout arearefs="co.mlt.read">Read Test data</callout> <callout arearefs="co.mlt.cat">Categorize</callout> <callout arearefs="co.mlt.an">Collect Results</callout> <callout arearefs="co.mlt.print">Display Results</callout> </calloutlist> */ //<end id="lucene.examples.mlt.test"/> } catch (OptionException e) { log.error("Error while parsing options", e); } }
From source file:de.tu_berlin.dima.aim3.naivebayes.classifier.ClassifyingMapper.java
License:Open Source License
@Override public void map(Label correctLabel, FeatureList features, Collector<LabelPair, PactInteger> out) { if (firstCall) { LOG.info("Reading model."); try {/*w w w. j a v a 2s . co m*/ Algorithm algorithm = new BayesAlgorithm(); // TODO: Support cbayes BayesParameters params = new BayesParameters(); params.setBasePath(modelBasePath); Datastore datastore = new PactBayesDatastore(params); classifier = new ClassifierContext(algorithm, datastore); classifier.initialize(); // defaultCategory = parameters.getString("", ""); // gramSize = params.getGramSize(); } catch (InvalidDatastoreException e) { } LOG.info("Reading model finished"); firstCall = false; } //TODO: Use gramsSize String[] document = new String[features.size()]; int i = 0; for (Feature feature : features) { document[i++] = feature.toString(); } try { ClassifierResult result = classifier.classifyDocument(document, defaultCategory); LabelPair labels = new LabelPair(); labels.setFirst(correctLabel); labels.setSecond(new Label(result.getLabel().getBytes())); out.collect(labels, ONE); } catch (InvalidDatastoreException e) { throw new RuntimeException(e); } }