Example usage for java.util.stream Collectors groupingBy

Introduction

On this page you can find example usage for java.util.stream Collectors groupingBy.

Prototype

public static <T, K> Collector<T, ?, Map<K, List<T>>> groupingBy(Function<? super T, ? extends K> classifier) 

Document

Returns a Collector implementing a "group by" operation on input elements of type T, grouping elements according to a classification function, and returning the results in a Map.
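
Below is a minimal, self-contained sketch of the idea (the word list and variable names are illustrative, not taken from the examples that follow): the classifier function maps each element to a key, and elements sharing a key are collected into a List under that key.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class GroupingByExample {
    public static void main(String[] args) {
        List<String> words = Arrays.asList("apple", "banana", "avocado", "cherry", "blueberry");

        // Classifier: the first character of each word becomes the map key;
        // all words starting with that character end up in the same List.
        Map<Character, List<String>> byInitial = words.stream()
                .collect(Collectors.groupingBy(w -> w.charAt(0)));

        // Typically prints {a=[apple, avocado], b=[banana, blueberry], c=[cherry]}
        // (the returned Map makes no ordering guarantees).
        System.out.println(byInitial);
    }
}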

Usage

From source file:sbu.srl.rolextract.ArgumentClassifier.java

public void trainAndTest(String trainDir, String testDir)
        throws IOException, FileNotFoundException, ClassNotFoundException, NoSuchMethodException,
        IllegalAccessException, IllegalArgumentException, InvocationTargetException {
    SBURoleTrain trainer = new SBURoleTrain(trainDir.concat("/train.ser"), isMultiClass);
    ArrayList<Sentence> trainData = (ArrayList<Sentence>) FileUtil
            .deserializeFromFile(trainDir.concat("/train.ser"));
    if (isMultiClass) {
        trainer.trainMultiClassClassifier(trainDir);
    } else {
        trainer.trainBinaryClassifier(trainDir);
    }

    FileUtil.serializeToFile(trainData, trainDir.concat("/train.ser"));
    SBURolePredict predict = new SBURolePredict(trainDir, testDir.concat("/test.arggold.ser"), isMultiClass);
    predict.performPrediction(testDir.concat("/test.arggold.ser"));
    ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
            .deserializeFromFile(testDir.concat("/test.argpredict.ser"));
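    // Group predicted sentences by process name (Map<processName, List<Sentence>>) for JSON export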
    Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
            .collect(Collectors.groupingBy(Sentence::getProcessName));

    ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
    SentenceUtil.flushDataToJSON(jsonData, testDir.concat("/test.srlout.json"), false);
    SentenceUtil.flushDataToJSON(jsonData, testDir.concat("/test.srlpredict.json"), true);
}

From source file:sbu.srl.rolextract.ArgumentClassifier.java

public void knowledgeExtractor() throws IOException, FileNotFoundException, ClassNotFoundException,
        NoSuchMethodException, IllegalAccessException, IllegalArgumentException, InvocationTargetException {
    boolean dirCreated = FileUtil.mkDir(outputDir);
    dirCreated = FileUtil.mkDir(outputDir.concat("/train"));
    dirCreated = FileUtil.mkDir(outputDir.concat("/test"));
    if (dirCreated) // this is not a good check; leave it for now
    {
        // TRAINING
        sentences = (ArrayList<Sentence>) sentences.stream().filter(data -> data.isAnnotated())
                .collect(Collectors.toList());
        FileUtil.serializeToFile(sentences, outputDir.concat("/train/train.ser"));
        SBURoleTrain trainer = new SBURoleTrain(outputDir.concat("/train/train.ser"), isMultiClass);
        trainer.train(outputDir.concat("/train"));
        FileUtil.serializeToFile(sentences, outputDir.concat("/train/train.ser"));

        // Read the knowledge sentences using SPOCK data reader
        SpockDataReader reader = new SpockDataReader(testingFileName, configFileName, true); // process, config, is testing
        reader.readData();
        ArrayList<Sentence> testSentences = reader.getSentences();
        FileUtil.serializeToFile(testSentences, outputDir.concat("/test/test.ser"));
        SBURolePredict predict = new SBURolePredict(outputDir.concat("/train"),
                outputDir.concat("/test/test.ser"), isMultiClass);
        predict.knownAnnotation = false;
        predict.performPrediction(outputDir.concat("/test/test.ser"));
        ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
                .deserializeFromFile(outputDir.concat("/test/predict.ser"));
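        // Group predictions by process name before serializing to JSON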
        Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
                .collect(Collectors.groupingBy(Sentence::getProcessName));
        ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
        SentenceUtil.flushDataToJSON(jsonData, outputDir.concat("/test/srlpredict.json"), true);
    }
}

From source file:sbu.srl.rolextract.SpockDataReader.java

public void readData() throws FileNotFoundException, IOException {
    List<String[]> data = new ArrayList<>();
    data = FileUtil.readDataObject(processFileName, "\t");
    mapFieldIdx(data.get(0));
    data = data.subList(1, data.size());
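    // Group the tab-separated rows by their sentence column so each sentence's annotations stay together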
    final Map<String, List<String[]>> sentenceMap = data.stream()
            .collect(Collectors.groupingBy(row -> row[fieldIdxMap.get("sentence")]));
    String[] roles = fieldMap.get("role").split(":");

    int totalUniqueSentence = sentenceMap.keySet().size();
    int sentProcessed = 0;
    System.out.println("TOTAL UNIQUE SENTENCE : " + totalUniqueSentence);
    for (String sentenceStr : sentenceMap.keySet()) {
        Sentence sentence = new Sentence(sentenceStr);
        boolean isAnnotated = false;
        sentence.setRawText(sentenceStr);
        sentence.setProcess(sentenceMap.get(sentenceStr).get(0)[fieldIdxMap.get("process")]);
        HashMap<String, ArrayList<ArgumentSpan>> roleAnnotationSpan = new HashMap<String, ArrayList<ArgumentSpan>>();
        for (String[] sentenceData : sentenceMap.get(sentenceStr)) {
            // Adding argument span here
            for (int i = 0; i < roles.length; i++) {
                int roleColumnIdx = fieldIdxMap.get(roles[i]);
                ArrayList<ArgumentSpan> spans = new ArrayList<ArgumentSpan>();
                if (sentenceData[roleColumnIdx].length() > 0) { // IF role filler is not empty
                    // Set role filler
                    List<String> tokens = StanfordTokenizerSingleton.getInstance()
                            .tokenize(sentenceData[roleColumnIdx].trim());
                    List<String> tokenizedRawText = StanfordTokenizerSingleton.getInstance()
                            .tokenize(sentence.getRawText());
                    String[] pattern = new String[tokens.size()];
                    tokens.toArray(pattern);
                    ArrayList<Integer> matchIdxs = getIdxMatchesv2(pattern,
                            tokenizedRawText.toArray(new String[tokenizedRawText.size()]));
                    DependencyTree tree = StanfordDepParserSingleton.getInstance().parse(sentence.getRawText());
                    ArrayList<DependencyNode> arrDepNodes = new ArrayList<DependencyNode>();

                    if (matchIdxs != null) {
                        for (int j = 1; j <= tree.lastKey(); j++) {
                            if (matchIdxs.contains(j)) {
                                arrDepNodes.add(tree.get(j));
                            }
                        }
                    }
                    ArgumentSpan span = new ArgumentSpan(arrDepNodes, roles[i]);
                    int annotationIdx = fieldIdxMap.get("is" + roles[i]);
                    if (sentenceData[annotationIdx].length() > 0) { // IF contains annotation
                        if (matchIdxs != null) {
                            isAnnotated = true;
                            if (sentenceData[annotationIdx].equalsIgnoreCase("1")) {
                                span.setAnnotatedLabel("1");
                            } else {
                                span.setAnnotatedLabel("-1");
                            }
                        }
                        span.setPattern(sentenceMap.get(sentenceStr).get(0)[fieldIdxMap.get("pattern")]);
                        spans.add(span);
                    }
                    // If this is a testing file, label the span as -1
                    if (isTestingFile) {
                        if (matchIdxs != null) {
                            isAnnotated = true;
                            span.setAnnotatedLabel("-1");
                            span.setPattern(sentenceMap.get(sentenceStr).get(0)[fieldIdxMap.get("pattern")]);
                            spans.add(span);
                        }
                    }
                }
                if (roleAnnotationSpan.get(roles[i]) == null) {
                    roleAnnotationSpan.put(roles[i], spans);
                } else {
                    ArrayList<ArgumentSpan> existingSpans = roleAnnotationSpan.get(roles[i]);
                    existingSpans.addAll(spans);
                    roleAnnotationSpan.put(roles[i], existingSpans);
                }
            }
        }
        if (skipNotAnnotated) {
            if (isAnnotated) {
                sentence.setAnnotated(isAnnotated);
                sentence.setRoleArgAnnotation(roleAnnotationSpan);
                sentences.add(sentence);
            }
        } else {
            sentence.setRoleArgAnnotation(roleAnnotationSpan);
            sentences.add(sentence);
        }
        System.out.println("Sentence processed : " + (++sentProcessed));
    }

    // SET THE ID
    for (int i = 0; i < sentences.size(); i++) {
        sentences.get(i).setId(i);
        int argId = 0;
        ArrayList<ArgumentSpan> spans = sentences.get(i).getAllAnnotatedArgumentSpan();
        for (ArgumentSpan span : spans) {
            span.setId(argId++);
        }
    }
}

From source file:sbu.srl.rolextract.SpockDataReader.java

public void dumpFrameElements(String fileName) throws FileNotFoundException {
    PrintWriter writer = new PrintWriter(fileName);
    Set<String> labels = getRoleLabels();
    Map<String, List<Sentence>> processSentPair = sentences.stream()
            .collect(Collectors.groupingBy(Sentence::getProcessName));
    for (String process : processSentPair.keySet()) {
        writer.println(process + "\t" + String.join(":", labels));
    }
    writer.close();
}

From source file:sbu.srl.rolextract.SpockDataReader.java

public void generateLexicalUnitFile(String dirName, int frameStartID, int luStartID) throws IOException {
    boolean success = FileUtil.mkDir(dirName);
    if (success) {
        // iterate through each process, give them ID
        int frameID = frameStartID;
        int luID = luStartID;
        Map<String, List<Sentence>> procSentPair = sentences.stream()
                .collect(Collectors.groupingBy(Sentence::getProcessName));
        for (String process : procSentPair.keySet()) {
            System.out.println(process + " frameID " + frameID);
            List<Sentence> sentenceArr = procSentPair.get(process);
            for (int i = 0; i < sentenceArr.size(); i++) {
                Sentence currentSent = sentenceArr.get(i);
                String lu = currentSent.getLexicalUnitFrame();
                System.out.println("luID " + (luID));

                // Create file here
                PrintWriter xmlWriter = new PrintWriter(dirName + "/lu" + luID + ".xml");
                xmlWriter.println("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\n"
                        + "<?xml-stylesheet type=\"text/xsl\" href=\"lexUnit.xsl\"?>\n"
                        + "<lexUnit status=\"Finished_Initial\" POS=\"N\" name=\"" + lu + "\" ID=\"" + luID
                        + "\" frame=\"" + process + "\" frameID=\"" + frameID
                        + "\" totalAnnotated=\"13\" xsi:schemaLocation=\"../schema/lexUnit.xsd\" xmlns=\"http://framenet.icsi.berkeley.edu\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\">\n"
                        + "    <header>\n" + "        <frame>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"9400D3\" type=\"Core\" abbrev=\"res\" name=\"result\"/>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"00008B\" type=\"Core\" abbrev=\"trig\" name=\"trigger\"/>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"FFA500\" type=\"Core\" abbrev=\"ena\" name=\"enabler\"/>\n"
                        + "            <FE fgColor=\"FFFFFF\" bgColor=\"0000FF\" type=\"Core\" abbrev=\"und\" name=\"undergoer\"/>\n"
                        + "        </frame>\n" + "    </header>\n" + "</lexUnit>");
                xmlWriter.close();
                luID++;
            }
            frameID++;
        }

    }
    // extract the lexical unit, give them ID
    // create the XML file as well
}