List of usage examples for com.google.common.collect Multiset add
@Override
boolean add(E element);
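Multiset.add(E element) adds a single occurrence of the element and, per the Collection contract, always returns true; count(Object) then reports how many occurrences have been accumulated. A minimal sketch of the call (class name and sample strings are illustrative):

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class MultisetAddExample {
    public static void main(String[] args) {
        Multiset<String> words = HashMultiset.create();
        words.add("guava");    // count("guava") becomes 1
        words.add("guava");    // count("guava") becomes 2
        words.add("multiset");
        System.out.println(words.count("guava"));    // 2
        System.out.println(words.count("multiset")); // 1
        System.out.println(words.size());            // 3 (total occurrences, not distinct elements)
    }
}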
From source file:org.apache.niolex.common.guava.GuavaCollections.java
/**
 * @param args
 */
public static void main(String[] args) {
    Multiset<String> wordsMultiset = HashMultiset.create();
    wordsMultiset.add("abc");
    wordsMultiset.add("abc");
    wordsMultiset.add("abcd");
    System.out.println("count => " + wordsMultiset.count("abc"));
    System.out.println("count => " + wordsMultiset.count("abcd"));

    BiMap<String, String> biMap = HashBiMap.create();
    biMap.put("good", "morning");
    biMap.put("bad", "afternoon");
    System.out.println("good => " + biMap.get("good"));
    System.out.println("afternoon => " + biMap.inverse().get("afternoon"));

    RangeMap<Integer, String> rangeMap = TreeRangeMap.create();
    rangeMap.put(Range.closed(1, 11), "Nice");
    rangeMap.put(Range.openClosed(11, 15), "Girl");
    System.out.println("11 => " + rangeMap.get(11));
    System.out.println("12 => " + rangeMap.get(12));
    System.out.println("15 => " + rangeMap.get(15));
    System.out.println("16 => " + rangeMap.get(16));

    List<Integer> countUp = Ints.asList(1, 2, 3, 4, 5);
    List<Integer> countDown = Lists.reverse(countUp); // {5, 4, 3, 2, 1}
    System.out.println("countUp => " + countUp);
    System.out.println("countDown => " + countDown);
}
From source file:edu.byu.nlp.data.app.AnnotationStream2Csv.java
public static void main(String[] args) throws IOException {
    // parse CLI arguments
    new ArgumentParser(AnnotationStream2Csv.class).parseArgs(args);
    Preconditions.checkNotNull(jsonStream, "You must provide a valid --json-stream!");
    Dataset data = readData(jsonStream);

    // optionally aggregate by instance
    String header = "annotator,start,end,annotation,label,source,num_correct_annotations,num_annotations,cum_num_annotations,num_annotators,cum_num_annotators\n";

    // iterate over instances and (optionally) annotations
    final StringBuilder bld = new StringBuilder();
    switch (row) {
    case ANNOTATION:
        // sort all annotations by end time
        Map<FlatInstance<SparseFeatureVector, Integer>, DatasetInstance> ann2InstMap = Maps.newIdentityHashMap();
        List<FlatInstance<SparseFeatureVector, Integer>> annotationList = Lists.newArrayList();
        for (DatasetInstance inst : data) {
            for (FlatInstance<SparseFeatureVector, Integer> ann : inst.getAnnotations().getRawAnnotations()) {
                ann2InstMap.put(ann, inst); // record instance of each annotation
                annotationList.add(ann);
            }
        }
        Collections.sort(annotationList, new Comparator<FlatInstance<SparseFeatureVector, Integer>>() {
            @Override
            public int compare(FlatInstance<SparseFeatureVector, Integer> o1,
                    FlatInstance<SparseFeatureVector, Integer> o2) {
                // no null checking since we want to fail if annotation time is not set.
                return Long.compare(o1.getEndTimestamp(), o2.getEndTimestamp());
            }
        });
        Set<Integer> annotators = Sets.newHashSet();
        for (Enumeration<FlatInstance<SparseFeatureVector, Integer>> item : Iterables2.enumerate(annotationList)) {
            FlatInstance<SparseFeatureVector, Integer> ann = item.getElement();
            DatasetInstance inst = ann2InstMap.get(ann);
            annotators.add(ann.getAnnotator());
            bld.append(ann.getAnnotator() + ",");
            bld.append(ann.getStartTimestamp() + ",");
            bld.append(ann.getEndTimestamp() + ",");
            bld.append(ann.getAnnotation() + ",");
            bld.append(inst.getLabel() + ",");
            bld.append(data.getInfo().getIndexers().getInstanceIdIndexer().get(inst.getInfo().getSource()) + ",");
            bld.append((!inst.hasLabel() ? "NA" : ann.getAnnotation() == inst.getLabel() ? 1 : 0) + ","); // num correct
            bld.append(1 + ","); // num annotations
            bld.append((item.getIndex() + 1) + ","); // cumulative num annotations
            bld.append(1 + ","); // num annotators
            bld.append(annotators.size() + ""); // cumulative num annotators
            bld.append("\n");
        }
        break;
    case INSTANCE:
        int cumNumAnnotations = 0;
        for (DatasetInstance inst : data) {
            cumNumAnnotations += inst.getInfo().getNumAnnotations();
            int numCorrectAnnotations = 0;
            // sum over all the annotators who put the correct answer (if available)
            if (inst.hasLabel()) {
                Integer correctLabel = inst.getLabel();
                for (int j = 0; j < data.getInfo().getNumAnnotators(); j++) {
                    numCorrectAnnotations += inst.getAnnotations().getLabelAnnotations().getRow(j)[correctLabel];
                }
            }
            bld.append("NA,");
            bld.append("NA,");
            bld.append("NA,");
            bld.append("NA,");
            bld.append(inst.getLabel() + ",");
            bld.append(inst.getInfo().getSource() + ",");
            bld.append(numCorrectAnnotations + ",");
            bld.append(inst.getInfo().getNumAnnotations() + ",");
            bld.append(cumNumAnnotations + ",");
            bld.append(inst.getInfo().getNumAnnotators() + ",");
            bld.append("NA"); // cumulative num annotators
            bld.append("\n");
        }
        break;
    case ANNOTATOR:
        Multiset<Integer> perAnnotatorAnnotationCounts = HashMultiset.create();
        Multiset<Integer> perAnnotatorCorrectAnnotationCounts = HashMultiset.create();
        for (DatasetInstance inst : data) {
            for (FlatInstance<SparseFeatureVector, Integer> ann : inst.getAnnotations().getRawAnnotations()) {
                int annotatorId = ann.getAnnotator();
                perAnnotatorAnnotationCounts.add(annotatorId);
                if (inst.getLabel() == ann.getAnnotation()) {
                    perAnnotatorCorrectAnnotationCounts.add(annotatorId);
                }
            }
        }
        for (String annotatorId : data.getInfo().getAnnotatorIdIndexer()) {
            bld.append(annotatorId + ",");
            bld.append("NA,");
            bld.append("NA,");
            bld.append("NA,");
            bld.append("NA,");
            bld.append("NA,");
            bld.append(perAnnotatorCorrectAnnotationCounts.count(annotatorId) + ",");
            bld.append(perAnnotatorAnnotationCounts.count(annotatorId) + ",");
            bld.append("NA,");
            bld.append("1,"); // num annotators
            bld.append("NA"); // cumulative num annotators
            bld.append("\n");
        }
        break;
    default:
        Preconditions.checkArgument(false, "unknown row type: " + row);
        break;
    }

    // output to console
    if (out == null) {
        System.out.println(header);
        System.out.println(bld.toString());
    } else {
        File outfile = new File(out);
        Files.write(header, outfile, Charsets.UTF_8);
        Files.append(bld, outfile, Charsets.UTF_8);
    }
}
From source file:com.umaircheema.mahout.utils.classifiers.NaiveBayesClassifier.java
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Mahout Naive Bayesian Classifier");
        System.out.println(
                "Classifies input text document into a class given a model, dictionary, document frequency and input file");
        System.out.println(
                "Arguments: [model] [label_index] [dictionary] [document-frequency] [input-text-file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String inputFilePath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from input file
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(inputFilePath));
    StringBuilder stringBuilder = new StringBuilder();
    String lineSeparator = System.getProperty("line.separator");
    String line = null;
    while ((line = reader.readLine()) != null) {
        stringBuilder.append(line);
        stringBuilder.append(lineSeparator);
    }
    // Close the reader I/O
    reader.close();

    Multiset<String> words = ConcurrentHashMultiset.create();

    // extract words from input file
    TokenStream ts = analyzer.tokenStream("text", new StringReader(stringBuilder.toString()));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    int wordCount = 0;
    while (ts.incrementToken()) {
        if (termAtt.length() > 0) {
            String word = ts.getAttribute(CharTermAttribute.class).toString();
            Integer wordId = dictionary.get(word);
            // if the word is not in the dictionary, skip it
            if (wordId != null) {
                words.add(word);
                wordCount++;
            }
        }
    }
    // Fixed error : close ts:TokenStream
    ts.end();
    ts.close();

    // create vector wordId => weight using tfidf
    Vector vector = new RandomAccessSparseVector(10000);
    TFIDF tfidf = new TFIDF();
    for (Multiset.Entry<String> entry : words.entrySet()) {
        String word = entry.getElement();
        int count = entry.getCount();
        Integer wordId = dictionary.get(word);
        Long freq = documentFrequency.get(wordId);
        double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
        vector.setQuick(wordId, tfIdfValue);
    }

    // With the classifier, we get one score for each label.
    // The label with the highest score is the one the email is more likely to be associated to.
    double bestScore = -Double.MAX_VALUE;
    int bestCategoryId = -1;
    Vector resultVector = classifier.classifyFull(vector);
    for (Element element : resultVector) {
        int categoryId = element.index();
        double score = element.get();
        if (score > bestScore) {
            bestScore = score;
            bestCategoryId = categoryId;
        }
    }
    System.out.println(" Class Label: => " + labels.get(bestCategoryId));
    System.out.println(" Score: => " + bestScore);

    analyzer.close();
}
From source file:mahout.classifier.Classifier.java
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // release the TokenStream so the analyzer can hand out a new one for the next line
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is more likely to be associated to.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}
From source file:org.mahout.example.classifier.naivebayes.Classifier.java
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];
        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // release the TokenStream so the analyzer can hand out a new one for the next tweet
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is more likely to be associated to.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:com.chimpler.example.bayes.Classifier.java
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];
        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is more likely to be associated to.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:eu.thebluemountain.customers.dctm.brownbag.badcontentslister.Main.java
public static void main(String[] args) {
    try {
        Map<Command, Optional<String>> cmds = Command.parse(args);
        if ((cmds.containsKey(Command.HELP)) || (!cmds.containsKey(Command.CONFIG))) {
            usage();
            return;
        }
        final JDBCConfig config = config(cmds.get(Command.CONFIG).get());
        String pwd = config.password.orNull();
        if (null == pwd) {
            Optional<String> opt = passwordOf("database", config.user);
            if (!opt.isPresent()) {
                throw new ExitException(RetCode.ERR_CANCELLED);
            }
            pwd = opt.get();
        }
        try (JDBCConnection from = create(config, pwd); CSVWriter writer = makeLog(config.user)) {
            Stopwatch watch = Stopwatch.createStarted();
            Stores stores = StoresReader.STORESREADER.apply(from);
            System.out.println("spent " + watch.stop() + " to load stores");
            final Function<DecoratedContent, Checks.Result> checker = Checks.checker(stores);
            final Multiset<Checks.Code> codes = TreeMultiset.create();
            watch.reset().start();
            ResponseUI rui = ResponseUI.create(1024, 64);
            try (CloseableIterator<DecoratedContent> it = DCReader.reader(from, stores)) {
                long count = 0L;
                while (it.hasNext()) {
                    DecoratedContent dc = it.next();
                    count++;
                    final Checks.Result result = checker.apply(dc);
                    assert null != result;
                    rui.onResponse(result);
                    final Checks.Code code = result.code;
                    codes.add(code);
                    if (code != Checks.Code.OK) {
                        // we've got an error then ....
                        writer.writeError(dc, result);
                    }
                }
                rui.finish();
                System.out.println("spent " + watch.stop() + " to read " + count + " d.c.");
                System.out.println("stats: " + codes);
                System.out.println("bye");
            }
        }
    } catch (SQLException e) {
        e.printStackTrace(System.err);
        System.err.flush();
        System.out.println();
        usage();
        System.exit(RetCode.ERR_SQL.ordinal());
    } catch (ExitException e) {
        e.exit();
    } catch (RuntimeException | IOException e) {
        e.printStackTrace(System.err);
        System.err.flush();
        System.out.println();
        usage();
        System.exit(RetCode.ERR_OTHER.ordinal());
    }
}
From source file:edu.stanford.rad.naivebayes.ClassifyLines.java
public static void main(String[] args) throws Exception {
    // if (args.length < 5) {
    //     System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //     return;
    // }
    // String modelPath = args[0];
    // String labelIndexPath = args[1];
    // String dictionaryPath = args[2];
    // String documentFrequencyPath = args[3];
    // String tweetsPath = args[4];
    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];
        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label.
        // The label with the highest score is the one the tweet is more likely to be associated to.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:di.uniba.it.wsd.tool.wn.BuildOccSense.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args) {
    try {
        BufferedReader in = new BufferedReader(new FileReader(new File(args[0])));
        Multiset<String> synset = HashMultiset.create();
        while (in.ready()) {
            String[] values = in.readLine().split("\\s+");
            String[] keys = values[0].split("%");
            String[] poss = keys[1].split(":");
            String offset = null;
            int occ = Integer.parseInt(values[3]);
            if (poss[0].equals("1")) {
                offset = values[1] + "n";
            } else if (poss[0].equals("2")) {
                offset = values[1] + "v";
            } else if (poss[0].equals("3") || poss[0].equals("5")) {
                offset = values[1] + "a";
            } else if (poss[0].equals("4")) {
                offset = values[1] + "r";
            }
            for (int i = 0; i < occ; i++) {
                synset.add(offset);
            }
        }
        in.close();
        BufferedWriter out = new BufferedWriter(new FileWriter(new File(args[1])));
        Iterator<Multiset.Entry<String>> iterator = synset.entrySet().iterator();
        while (iterator.hasNext()) {
            Multiset.Entry<String> entry = iterator.next();
            out.append(entry.getElement()).append("\t").append(String.valueOf(entry.getCount()));
            out.newLine();
        }
        out.close();
    } catch (IOException | NumberFormatException ioex) {
        Logger.getLogger(BuildOccSense.class.getName()).log(Level.SEVERE, "IO Error", ioex);
    }
}
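The inner for-loop above registers occ occurrences by calling synset.add(offset) once per occurrence. Guava's Multiset also provides an add(E element, int occurrences) overload that records several occurrences in a single call; a minimal sketch of that variant (class name and the offset value are illustrative, not taken from the source):

import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;

public class OccurrenceAddSketch {
    public static void main(String[] args) {
        Multiset<String> synset = HashMultiset.create();
        String offset = "00001740n"; // illustrative WordNet-style offset
        int occ = 3;
        synset.add(offset, occ);                   // same effect as calling add(offset) three times
        System.out.println(synset.count(offset));  // 3
    }
}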
From source file:ClassifierHD.java
public static void main(String[] args) throws Exception {
    if (args.length < 6) { // six positional arguments are read below (args[0] .. args[5])
        System.out.println(
                "Arguments: [model] [label index] [dictionnary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (new String(status[i].getPath().getName()).equals("rep.list")) {
                continue;
            }
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
            ts.close();

            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));

            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();

            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}