List of usage examples for com.google.common.collect Multiset add
@Override
boolean add(E element);
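Multiset.add(E element) adds a single occurrence of the element and, per the Collection contract, always returns true; count(Object) then reports how many occurrences have been accumulated. A minimal sketch of the call (class name and sample strings are illustrative):

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class MultisetAddExample {
    public static void main(String[] args) {
        Multiset<String> words = HashMultiset.create();
        words.add("guava");    // count("guava") becomes 1
        words.add("guava");    // count("guava") becomes 2
        words.add("multiset");
        System.out.println(words.count("guava"));    // 2
        System.out.println(words.count("multiset")); // 1
        System.out.println(words.size());            // 3 (total occurrences, not distinct elements)
    }
}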
From source file:org.apache.niolex.common.guava.GuavaCollections.java
/**
 * @param args
 */
public static void main(String[] args) {
    Multiset<String> wordsMultiset = HashMultiset.create();
    wordsMultiset.add("abc");
    wordsMultiset.add("abc");
    wordsMultiset.add("abcd");
    System.out.println("count => " + wordsMultiset.count("abc"));
    System.out.println("count => " + wordsMultiset.count("abcd"));

    BiMap<String, String> biMap = HashBiMap.create();
    biMap.put("good", "morning");
    biMap.put("bad", "afternoon");
    System.out.println("good => " + biMap.get("good"));
    System.out.println("afternoon => " + biMap.inverse().get("afternoon"));

    RangeMap<Integer, String> rangeMap = TreeRangeMap.create();
    rangeMap.put(Range.closed(1, 11), "Nice");
    rangeMap.put(Range.openClosed(11, 15), "Girl");
    System.out.println("11 => " + rangeMap.get(11));
    System.out.println("12 => " + rangeMap.get(12));
    System.out.println("15 => " + rangeMap.get(15));
    System.out.println("16 => " + rangeMap.get(16));

    List<Integer> countUp = Ints.asList(1, 2, 3, 4, 5);
    List<Integer> countDown = Lists.reverse(countUp); // {5, 4, 3, 2, 1}
    System.out.println("countUp => " + countUp);
    System.out.println("countDown => " + countDown);
}
From source file:edu.byu.nlp.data.app.AnnotationStream2Csv.java
public static void main(String[] args) throws IOException {
    // parse CLI arguments
    new ArgumentParser(AnnotationStream2Csv.class).parseArgs(args);
    Preconditions.checkNotNull(jsonStream, "You must provide a valid --json-stream!");
    Dataset data = readData(jsonStream);

    // optionally aggregate by instance
    String header = "annotator,start,end,annotation,label,source,num_correct_annotations,num_annotations,cum_num_annotations,num_annotators,cum_num_annotators\n";

    // iterate over instances and (optionally) annotations
    final StringBuilder bld = new StringBuilder();
    switch (row) {
    case ANNOTATION:
        // sort all annotations by end time
        Map<FlatInstance<SparseFeatureVector, Integer>, DatasetInstance> ann2InstMap = Maps.newIdentityHashMap();
        List<FlatInstance<SparseFeatureVector, Integer>> annotationList = Lists.newArrayList();
        for (DatasetInstance inst : data) {
            for (FlatInstance<SparseFeatureVector, Integer> ann : inst.getAnnotations().getRawAnnotations()) {
                ann2InstMap.put(ann, inst); // record instance of each annotation
                annotationList.add(ann);
            }
        }
        Collections.sort(annotationList, new Comparator<FlatInstance<SparseFeatureVector, Integer>>() {
            @Override
            public int compare(FlatInstance<SparseFeatureVector, Integer> o1,
                    FlatInstance<SparseFeatureVector, Integer> o2) {
                // no null checking since we want to fail if annotation time is not set.
                return Long.compare(o1.getEndTimestamp(), o2.getEndTimestamp());
            }
        });
        Set<Integer> annotators = Sets.newHashSet();
        for (Enumeration<FlatInstance<SparseFeatureVector, Integer>> item : Iterables2.enumerate(annotationList)) {
            FlatInstance<SparseFeatureVector, Integer> ann = item.getElement();
            DatasetInstance inst = ann2InstMap.get(ann);
            annotators.add(ann.getAnnotator());
            bld.append(ann.getAnnotator() + ",");
            bld.append(ann.getStartTimestamp() + ",");
            bld.append(ann.getEndTimestamp() + ",");
            bld.append(ann.getAnnotation() + ",");
            bld.append(inst.getLabel() + ",");
            bld.append(data.getInfo().getIndexers().getInstanceIdIndexer().get(inst.getInfo().getSource()) + ",");
            bld.append((!inst.hasLabel() ? "NA" : ann.getAnnotation() == inst.getLabel() ? 1 : 0) + ","); // num correct
            bld.append(1 + ","); // num annotations
            bld.append((item.getIndex() + 1) + ","); // cumulative num annotations
            bld.append(1 + ","); // num annotators
            bld.append(annotators.size() + ""); // cumulative num annotators
            bld.append("\n");
        }
        break;
    case INSTANCE:
        int cumNumAnnotations = 0;
        for (DatasetInstance inst : data) {
            cumNumAnnotations += inst.getInfo().getNumAnnotations();
            int numCorrectAnnotations = 0;
            // sum over all the annotators who put the correct answer (if available)
            if (inst.hasLabel()) {
                Integer correctLabel = inst.getLabel();
                for (int j = 0; j < data.getInfo().getNumAnnotators(); j++) {
                    numCorrectAnnotations += inst.getAnnotations().getLabelAnnotations().getRow(j)[correctLabel];
                }
            }
            bld.append("NA,");
            bld.append("NA,");
            bld.append("NA,");
            bld.append("NA,");
            bld.append(inst.getLabel() + ",");
            bld.append(inst.getInfo().getSource() + ",");
            bld.append(numCorrectAnnotations + ",");
            bld.append(inst.getInfo().getNumAnnotations() + ",");
            bld.append(cumNumAnnotations + ",");
            bld.append(inst.getInfo().getNumAnnotators() + ",");
            bld.append("NA"); // cumulative num annotators
            bld.append("\n");
        }
        break;
    case ANNOTATOR:
        Multiset<Integer> perAnnotatorAnnotationCounts = HashMultiset.create();
        Multiset<Integer> perAnnotatorCorrectAnnotationCounts = HashMultiset.create();
        for (DatasetInstance inst : data) {
            for (FlatInstance<SparseFeatureVector, Integer> ann : inst.getAnnotations().getRawAnnotations()) {
                int annotatorId = ann.getAnnotator();
                perAnnotatorAnnotationCounts.add(annotatorId);
                if (inst.getLabel() == ann.getAnnotation()) {
                    perAnnotatorCorrectAnnotationCounts.add(annotatorId);
                }
            }
        }
        for (String annotatorId : data.getInfo().getAnnotatorIdIndexer()) {
            bld.append(annotatorId + ",");
            bld.append("NA,");
            bld.append("NA,");
            bld.append("NA,");
            bld.append("NA,");
            bld.append("NA,");
            bld.append(perAnnotatorCorrectAnnotationCounts.count(annotatorId) + ",");
            bld.append(perAnnotatorAnnotationCounts.count(annotatorId) + ",");
            bld.append("NA,");
            bld.append("1,"); // num annotators
            bld.append("NA"); // cumulative num annotators
            bld.append("\n");
        }
        break;
    default:
        Preconditions.checkArgument(false, "unknown row type: " + row);
        break;
    }

    // output to console
    if (out == null) {
        System.out.println(header);
        System.out.println(bld.toString());
    } else {
        File outfile = new File(out);
        Files.write(header, outfile, Charsets.UTF_8);
        Files.append(bld, outfile, Charsets.UTF_8);
    }
}
From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println(
                "Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println(
                "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    // Close the reader I/O
    reader.close();

    Multiset<String> words = ConcurrentHashMultiset.create();

    // extract words from input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = ts.getAttribute(CharTermAttribute.class).toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // Fixed error : close ts:TokenStream
    ts.end();
    ts.close();

    // create vector wordId => weight using tfidf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }

    // With the classifier, we get one score for each label.
    // The label with the highest score is the one the email is more likely to be associated to.
    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }
    }
    System.out.println(" Class Label: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);

    analyzer.close();
}
From source file:mahout.classifier.Classifier.java
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // release the TokenStream so the analyzer can hand out a new one for the next line
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is more likely to be associated to.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}
From source file:org.mahout.example.classifier.naivebayes.Classifier.java
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];
        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // release the TokenStream so the analyzer can hand out a new one for the next tweet
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is more likely to be associated to.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:com.chimpler.example.bayes.Classifier.java
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];
        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is more likely to be associated to.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:eu.thebluemountain.customers.dctm.brownbag.badcontentslister.Main.java
public static void main(String[] args) {
    try {
        Map<Command, Optional<String>> cmds = Command.parse(args);
        if ((cmds.containsKey(Command.HELP)) || (!cmds.containsKey(Command.CONFIG))) {
            usage();
            return;
        }
        final JDBCConfig config = config(cmds.get(Command.CONFIG).get());
        String pwd = config.password.orNull();
        if (null == pwd) {
            Optional<String> opt = passwordOf("database", config.user);
            if (!opt.isPresent()) {
                throw new ExitException(RetCode.ERR_CANCELLED);
            }
            pwd = opt.get();
        }
        try (JDBCConnection from = create(config, pwd); CSVWriter writer = makeLog(config.user)) {
            Stopwatch watch = Stopwatch.createStarted();
            Stores stores = StoresReader.STORESREADER.apply(from);
            System.out.println("spent " + watch.stop() + " to load stores");
            final Function<DecoratedContent, Checks.Result> checker = Checks.checker(stores);
            final Multiset<Checks.Code> codes = TreeMultiset.create();
            watch.reset().start();
            ResponseUI rui = ResponseUI.create(1024, 64);
            try (CloseableIterator<DecoratedContent> it = DCReader.reader(from, stores)) {
                long count = 0L;
                while (it.hasNext()) {
                    DecoratedContent dc = it.next();
                    count++;
                    final Checks.Result result = checker.apply(dc);
                    assert null != result;
                    rui.onResponse(result);
                    final Checks.Code code = result.code;
                    codes.add(code);
                    if (code != Checks.Code.OK) {
                        // we've got an error then ....
                        writer.writeError(dc, result);
                    }
                }
                rui.finish();
                System.out.println("spent " + watch.stop() + " to read " + count + " d.c.");
                System.out.println("stats: " + codes);
                System.out.println("bye");
            }
        }
    } catch (SQLException e) {
        e.printStackTrace(System.err);
        System.err.flush();
        System.out.println();
        usage();
        System.exit(RetCode.ERR_SQL.ordinal());
    } catch (ExitException e) {
        e.exit();
    } catch (RuntimeException | IOException e) {
        e.printStackTrace(System.err);
        System.err.flush();
        System.out.println();
        usage();
        System.exit(RetCode.ERR_OTHER.ordinal());
    }
}
From source file:edu.stanford.rad.naivebayes.ClassifyLines.java
public static void main(String[] args) throws Exception {
    // if (args.length < 5) {
    //     System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //     return;
    // }
    // String modelPath = args[0];
    // String labelIndexPath = args[1];
    // String dictionaryPath = args[2];
    // String documentFrequencyPath = args[3];
    // String tweetsPath = args[4];
    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];
        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is more likely to be associated to.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:di.uniba.it.wsd.tool.wn.BuildOccSense.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args) {
    try {
        BufferedReader in = new BufferedReader(new FileReader(new File(args[0])));
        Multiset<String> synset = HashMultiset.create();
        while (in.ready()) {
            String[] values = in.readLine().split("\\s+");
            String[] keys = values[0].split("%");
            String[] poss = keys[1].split(":");
            String offset = null;
            int occ = Integer.parseInt(values[3]);
            if (poss[0].equals("1")) {
                offset = values[1] + "n";
            } else if (poss[0].equals("2")) {
                offset = values[1] + "v";
            } else if (poss[0].equals("3") || poss[0].equals("5")) {
                offset = values[1] + "a";
            } else if (poss[0].equals("4")) {
                offset = values[1] + "r";
            }
            for (int i = 0; i < occ; i++) {
                synset.add(offset);
            }
        }
        in.close();
        BufferedWriter out = new BufferedWriter(new FileWriter(new File(args[1])));
        Iterator<Multiset.Entry<String>> iterator = synset.entrySet().iterator();
        while (iterator.hasNext()) {
            Multiset.Entry<String> entry = iterator.next();
            out.append(entry.getElement()).append("\t").append(String.valueOf(entry.getCount()));
            out.newLine();
        }
        out.close();
    } catch (IOException | NumberFormatException ioex) {
        Logger.getLogger(BuildOccSense.class.getName()).log(Level.SEVERE, "IO Error", ioex);
    }
}
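The inner for-loop above registers occ occurrences by calling synset.add(offset) once per occurrence. Guava's Multiset also provides an add(E element, int occurrences) overload that records several occurrences in a single call; a minimal sketch of that variant (class name and the offset value are illustrative, not taken from the source):

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class OccurrenceAddSketch {
    public static void main(String[] args) {
        Multiset<String> synset = HashMultiset.create();
        String offset = "00001740n"; // illustrative WordNet-style offset
        int occ = 3;
        synset.add(offset, occ);                   // same effect as calling add(offset) three times
        System.out.println(synset.count(offset));  // 3
    }
}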
From source file:ClassifierHD.java
public static void main(String[] args) throws Exception {
    if (args.length < 6) { // six positional arguments are read below (args[0] .. args[5])
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
                continue;
            }
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
            ts.close();

            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));

            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();

            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}