Example usage for java.util.stream Collectors groupingBy

List of usage examples for java.util.stream Collectors groupingBy

Introduction

In this page you can find the example usage for java.util.stream Collectors groupingBy.

Prototype

public static <T, K> Collector<T, ?, Map<K, List<T>>> groupingBy(Function<? super T, ? extends K> classifier) 

Source Link

Document

Returns a Collector implementing a "group by" operation on input elements of type T, grouping elements according to a classification function, and returning the results in a Map.

Usage

From source file:org.wso2.is.portal.user.client.api.ChallengeQuestionManagerClientServiceImpl.java

/**
 * Returns the challenge questions available to the given user, grouped into one
 * {@link ChallengeQuestionSetEntry} per question-set id. Both the set id on the
 * entry and the set id on each question are Base64/URL-encoded via
 * {@code encodeChallengeQuestionSetId} before being returned.
 *
 * @param userUniqueId unique id of the user whose questions are fetched
 * @return one entry per question set, each carrying its encoded set id and questions
 * @throws IdentityRecoveryException if the required services are unavailable or recovery fails
 * @throws IdentityStoreException    if the identity store lookup fails
 * @throws UserNotFoundException     if no user exists for {@code userUniqueId}
 */
@Override
public List<ChallengeQuestionSetEntry> getChallengeQuestionList(String userUniqueId)
        throws IdentityRecoveryException, IdentityStoreException, UserNotFoundException {

    // Fail fast before doing any work if the required OSGi services are not bound yet.
    if (challengeQuestionManager == null || realmService == null) {
        throw new IdentityRecoveryException("Challenge question manager or Realm service is not available.");
    }
    User user = realmService.getIdentityStore().getUser(userUniqueId);
    List<ChallengeQuestion> challengeQuestions = challengeQuestionManager.getAllChallengeQuestionsForUser(user);

    // Group the flat question list by set id so each set becomes one entry.
    Map<String, List<ChallengeQuestion>> groupedChallengeQuestionMap = challengeQuestions.stream()
            .collect(Collectors.groupingBy(ChallengeQuestion::getQuestionSetId));

    List<ChallengeQuestionSetEntry> challengeQuestionSetEntryList = new ArrayList<>();
    for (Map.Entry<String, List<ChallengeQuestion>> entry : groupedChallengeQuestionMap.entrySet()) {
        ChallengeQuestionSetEntry challengeQuestionSetEntry = new ChallengeQuestionSetEntry();
        challengeQuestionSetEntry.setChallengeQuestionSetId(encodeChallengeQuestionSetId(entry.getKey()));
        // NOTE(review): the map lambda mutates the ChallengeQuestion instances in place
        // (encoding their set id) rather than copying them; preserved as-is because
        // callers may rely on seeing the encoded id on the original objects.
        List<ChallengeQuestion> encodedSetIdChallengeQuestionsList = entry.getValue().stream()
                .map(challengeQuestion -> {
                    challengeQuestion.setQuestionSetId(
                            encodeChallengeQuestionSetId(challengeQuestion.getQuestionSetId()));
                    return challengeQuestion;
                }).collect(Collectors.toList());
        challengeQuestionSetEntry.setChallengeQuestionList(encodedSetIdChallengeQuestionsList);
        challengeQuestionSetEntryList.add(challengeQuestionSetEntry);
    }
    return challengeQuestionSetEntryList;
}

From source file:pl.prutkowski.java.playground.java8.TestCollectors.java

/**
 * Demonstrates several {@code java.util.stream.Collectors}: toMap, groupingBy,
 * averagingDouble, summarizingDouble, reducing, plus a custom ListCollector.
 *
 * @param args the command line arguments (unused)
 */
public static void main(String[] args) {
    // Month name (upper-cased) -> number of 'e' characters it contains.
    Map<String, Integer> monthByLen = months.stream()
            .collect(Collectors.toMap(String::toUpperCase, m -> StringUtils.countMatches(m, "e")));

    monthByLen.forEach((month, eCount) -> System.out.println(month + " -> " + eCount));

    System.out.println("---------------------------------");

    // FIX: StringUtils.countMatches returns int, so the grouping key is Integer,
    // not Object — declare the map with its precise key type.
    Map<Integer, List<String>> monthByLen2 = months.stream()
            .collect(Collectors.groupingBy(m -> StringUtils.countMatches(m, "e")));

    monthByLen2.forEach((count, groupedMonths) -> System.out.println(count + " -> " + groupedMonths));

    System.out.println("---------------------------------");

    Double averageLength = months.stream().collect(Collectors.averagingDouble(String::length));
    System.out.println("Average length: " + averageLength);
    System.out.println("---------------------------------");

    Double max = months.stream().collect(Collectors.summarizingDouble(String::length)).getMax();
    System.out.println("Max length: " + max);
    System.out.println("---------------------------------");

    // FIX: avoid unchecked Optional.get(); an empty months list now prints "" instead
    // of throwing NoSuchElementException.
    String reduced = months.stream().collect(Collectors.reducing((m1, m2) -> (m1 + ", " + m2))).orElse("");
    System.out.println("Reduced: " + reduced);
    System.out.println("---------------------------------");
    System.out.println(String.join(", ", months));
    System.out.println("---------------------------------");

    List<String> monthsWithZ = months.stream().filter(m -> m.contains("z")).collect(new ListCollector<>());
    System.out.println(monthsWithZ);

}

From source file:sbu.srl.rolextract.ArgumentClassifier.java

/**
 * Runs k-fold cross-validation: for each fold, trains on the fold's train split,
 * predicts on its test split, groups the predicted sentences by process name,
 * and flushes the predictions as JSON files into the fold's test directory.
 *
 * @param outputDir       root directory containing the {@code fold-N} subdirectories
 * @param crossValidation number of folds to process
 */
public void performCrossValidation(String outputDir, int crossValidation)
        throws IOException, FileNotFoundException, ClassNotFoundException, NoSuchMethodException,
        IllegalAccessException, IllegalArgumentException, InvocationTargetException {
    for (int i = 1; i <= crossValidation; i++) {
        File trainFoldDir = new File(outputDir.concat("/fold-").concat("" + i).concat("/train"));
        File testFoldDir = new File(outputDir.concat("/fold-").concat("" + i).concat("/test"));
        SBURoleTrain trainer = new SBURoleTrain(trainFoldDir.getAbsolutePath().concat("/train.ser"),
                isMultiClass);
        trainer.train(trainFoldDir.getAbsolutePath());

        SBURolePredict predict = new SBURolePredict(trainFoldDir.getAbsolutePath(),
                testFoldDir.getAbsolutePath().concat("/test.arggold.ser"), isMultiClass);
        predict.performPrediction(testFoldDir.getAbsolutePath().concat("/test.arggold.ser"));

        ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
                .deserializeFromFile(testFoldDir.getAbsolutePath().concat("/test.argpredict.ser"));
        // Group predictions per process so the JSON writer emits one section per process.
        Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
                .collect(Collectors.groupingBy(Sentence::getProcessName));

        ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
        SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.srlout.json"),
                false);
        SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.srlpredict.json"),
                true);
        SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.ilppredict.json"),
                true);
        SentenceUtil.flushDataToJSON(jsonData,
                testFoldDir.getAbsolutePath().concat("/test.semaforpredict.json"), true);
        // FIX: test.easysrlpredict.json was previously flushed twice with identical
        // content (a leftover of the commented-out EasySRL block); write it once.
        SentenceUtil.flushDataToJSON(jsonData,
                testFoldDir.getAbsolutePath().concat("/test.easysrlpredict.json"), true);
    }
}

From source file:sbu.srl.rolextract.ArgumentClassifier.java

/**
 * Feature-ablation experiment. For each feature group listed in
 * {@code features.ablation}: removes the group from the full feature set,
 * re-runs cross-validation (train + predict + JSON dumps per fold), evaluates
 * via the external ILP scripts, appends the scores to {@code ablationNew.txt},
 * and restores the removed group before trying the next one.
 *
 * @param outputDir       root directory containing the {@code fold-N} subdirectories
 * @param crossValidation number of folds
 */
public void performAblation(String outputDir, int crossValidation)
        throws IOException, FileNotFoundException, ClassNotFoundException, NoSuchMethodException,
        IllegalAccessException, IllegalArgumentException, InvocationTargetException, InterruptedException {
    ArrayList<String> triedFeatures = new ArrayList<String>(
            Arrays.asList(FileUtil.readLinesFromFile("./configSBUProcRel/features.ori")));
    List<String> ablationFeatures = getAblationFeatures("./configSBUProcRel/features.ablation");

    for (int idxAblation = 0; idxAblation < ablationFeatures.size(); idxAblation++) {
        System.out.println("Removing features : " + ablationFeatures.get(idxAblation));
        Thread.sleep(3000); // Pause so the operator can see which group is being ablated.
        List<String> removedFeatures = Arrays.asList(ablationFeatures.get(idxAblation).split(","));
        triedFeatures.removeAll(removedFeatures);
        // The trainer reads the active feature set from this config file.
        FileUtil.dumpToFile(triedFeatures, "./configSBUProcRel/features");
        for (int idxFold = 1; idxFold <= crossValidation; idxFold++) {
            File trainFoldDir = new File(outputDir.concat("/fold-").concat("" + idxFold).concat("/train"));
            File testFoldDir = new File(outputDir.concat("/fold-").concat("" + idxFold).concat("/test"));
            SBURoleTrain trainer = new SBURoleTrain(trainFoldDir.getAbsolutePath().concat("/train.ser"),
                    isMultiClass);
            trainer.train(trainFoldDir.getAbsolutePath());

            SBURolePredict predict = new SBURolePredict(trainFoldDir.getAbsolutePath(),
                    testFoldDir.getAbsolutePath().concat("/test.arggold.ser"), isMultiClass);
            predict.performPrediction(testFoldDir.getAbsolutePath().concat("/test.arggold.ser"));

            ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
                    .deserializeFromFile(testFoldDir.getAbsolutePath().concat("/test.argpredict.ser"));
            Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
                    .collect(Collectors.groupingBy(Sentence::getProcessName));

            ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
            SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.srlout.json"),
                    false);
            SentenceUtil.flushDataToJSON(jsonData,
                    testFoldDir.getAbsolutePath().concat("/test.srlpredict.json"), true);
            SentenceUtil.flushDataToJSON(jsonData,
                    testFoldDir.getAbsolutePath().concat("/test.ilppredict.json"), true); // dummy
            SentenceUtil.flushDataToJSON(jsonData,
                    testFoldDir.getAbsolutePath().concat("/test.semaforpredict.json"), true);// dummy
            SentenceUtil.flushDataToJSON(jsonData,
                    testFoldDir.getAbsolutePath().concat("/test.easysrlpredict.json"), true);// dummy

        }
        // copy all data to ILP's data folder
        // cp -r outputDir /home/slouvan/NetBeansProjects/ILP/data/
        try {
            ProcessBuilder pb = new ProcessBuilder(
                    "/home/slouvan/NetBeansProjects/SRL-Integrated/script/cpDir.sh", outputDir,
                    "/home/slouvan/NetBeansProjects/ILP/data/");
            Process p = pb.start(); // Start the process.
            p.waitFor(); // Wait for the process to finish.
            StdUtil.printOutput(p);

            pb = new ProcessBuilder("/usr/bin/python", "/home/slouvan/NetBeansProjects/ILP/evaluate.py");
            p = pb.start(); // Start the process.
            p.waitFor(); // Wait for the process to finish.
            StdUtil.printOutput(p);

            System.out.println("Script executed successfully");
        } catch (Exception e) {
            // Best-effort: evaluation failures are logged but do not abort the ablation run.
            e.printStackTrace();
        }
        String[] lines = FileUtil.readLinesFromFile("/home/slouvan/NetBeansProjects/ILP/stats.txt");
        // FIX: try-with-resources so the writer is closed (and flushed) even if println throws.
        try (PrintWriter out = new PrintWriter(
                new BufferedWriter(new FileWriter(GlobalV.PROJECT_DIR + "/ablationNew.txt", true)))) {
            out.println((new Date()).toString() + " Removed features " + removedFeatures);
            out.println("Eval : " + Arrays.toString(lines));
        }

        // Restore the removed group before trying the next ablation.
        triedFeatures.addAll(removedFeatures);
    }
}

From source file:sbu.srl.rolextract.ArgumentClassifier.java

/**
 * Greedy forward feature selection. Repeatedly tries adding each remaining
 * feature to the selected set, runs training/prediction on fold 1, evaluates
 * F1 via the external ILP scripts, keeps the feature that improved F1 the
 * most, and appends progress to {@code ablation.txt} — until every feature
 * has been selected.
 *
 * @param outputDir       root directory containing the {@code fold-N} subdirectories
 * @param crossValidation number of folds (note: the inner loop currently evaluates fold 1 only)
 */
public void performGreedySearch(String outputDir, int crossValidation)
        throws FileNotFoundException, IOException, ClassNotFoundException, NoSuchMethodException,
        IllegalAccessException, IllegalArgumentException, InvocationTargetException, InterruptedException {
    // availFeatures  =  Get all available features)
    List<String> availableFeatures = new ArrayList<String>(
            Arrays.asList(FileUtil.readLinesFromFile("./configSBUProcRel/features")));
    int nbFeat = availableFeatures.size();
    ArrayList<String> triedFeatures = Lists.newArrayList();
    while (triedFeatures.size() < nbFeat) {
        double maxF1 = 0.0;
        String bestFeat = "";
        for (int i = 0; i < availableFeatures.size(); i++) {
            String nextFeat = availableFeatures.get(i);
            System.out.println("Trying with " + nextFeat);
            Thread.sleep(5000); // Pause so the operator can see which feature is being tried.
            triedFeatures.add(nextFeat);
            // The trainer reads the active feature set from this config file.
            FileUtil.dumpToFile(triedFeatures, "./configSBUProcRel/features");

            // NOTE(review): only fold 1 is evaluated here (j <= 1), presumably to keep
            // the greedy search fast — confirm this is intentional.
            for (int j = 1; j <= 1; j++) {
                File trainFoldDir = new File(outputDir.concat("/fold-").concat("" + j).concat("/train"));
                File testFoldDir = new File(outputDir.concat("/fold-").concat("" + j).concat("/test"));
                SBURoleTrain trainer = new SBURoleTrain(trainFoldDir.getAbsolutePath().concat("/train.ser"),
                        isMultiClass);
                trainer.train(trainFoldDir.getAbsolutePath());

                SBURolePredict predict = new SBURolePredict(trainFoldDir.getAbsolutePath(),
                        testFoldDir.getAbsolutePath().concat("/test.arggold.ser"), isMultiClass);
                predict.performPrediction(testFoldDir.getAbsolutePath().concat("/test.arggold.ser"));

                ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
                        .deserializeFromFile(testFoldDir.getAbsolutePath().concat("/test.argpredict.ser"));
                Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
                        .collect(Collectors.groupingBy(Sentence::getProcessName));

                ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.srlout.json"), false);
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.srlpredict.json"), true);
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.ilppredict.json"), true);
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.semaforpredict.json"), true);
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.easysrlpredict.json"), true);
            }
            // copy all data to ILP's data folder
            // cp -r outputDir /home/slouvan/NetBeansProjects/ILP/data/
            try {
                ProcessBuilder pb = new ProcessBuilder(
                        "/home/slouvan/NetBeansProjects/SRL-Integrated/script/cpDir.sh", outputDir,
                        "/home/slouvan/NetBeansProjects/ILP/data/");
                Process p = pb.start(); // Start the process.
                p.waitFor(); // Wait for the process to finish.
                StdUtil.printOutput(p);

                pb = new ProcessBuilder("/usr/bin/python", "/home/slouvan/NetBeansProjects/ILP/evaluate.py");
                p = pb.start(); // Start the process.
                p.waitFor(); // Wait for the process to finish.
                StdUtil.printOutput(p);

                System.out.println("Script executed successfully");
            } catch (Exception e) {
                // Best-effort: evaluation failures are logged but do not abort the search.
                e.printStackTrace();
            }
            String[] lines = FileUtil.readLinesFromFile("/home/slouvan/NetBeansProjects/ILP/f1.txt");
            double currentF1 = Double.parseDouble(lines[0]);
            if (currentF1 > maxF1) {
                maxF1 = currentF1;
                bestFeat = nextFeat;
            }
            triedFeatures.remove(nextFeat);
        }

        triedFeatures.add(bestFeat);
        System.out.println("Features used : " + triedFeatures);
        System.out.println(
                "Best feature at length " + triedFeatures.size() + " is " + bestFeat + " currentF1 : " + maxF1);
        availableFeatures.remove(bestFeat);
        // FIX: try-with-resources so the writer is closed (and flushed) even if println throws.
        try (PrintWriter out = new PrintWriter(
                new BufferedWriter(new FileWriter(GlobalV.PROJECT_DIR + "/ablation.txt", true)))) {
            out.println("Features used : " + triedFeatures);
            out.println((new Date()).toString() + " Best feature at length " + triedFeatures.size() + " is "
                    + bestFeat + " currentF1 : " + maxF1);
            System.out.println("Tried features length : " + triedFeatures.size() + " NbFeat :" + nbFeat);
        }
    }
    //      for each feat from availFeat
    //         add nextFEat to triedFeat
    //         set the feature config file
    //         doCrossVal, output dummy semafor etc
    //         measureF1 {python here} output to a file, read that file
    //         updateMax
    //         remove nextFeat
    //      print best F1 here
    //      add bestFeat to triedFeat
}

From source file:sbu.srl.rolextract.ArgumentClassifier.java

/**
 * Stepwise feature addition. For each stage, tries adding every not-yet-selected
 * feature group from {@code features.ablation}, runs full cross-validation,
 * evaluates via the external ILP scripts, keeps the group with the best F1, and
 * appends the stage's results to {@code additionNew.txt}.
 *
 * @param outputDir       root directory containing the {@code fold-N} subdirectories
 * @param crossValidation number of folds
 */
public void performedFeatureAddition(String outputDir, int crossValidation)
        throws FileNotFoundException, IOException, ClassNotFoundException, NoSuchMethodException,
        IllegalAccessException, IllegalArgumentException, InvocationTargetException, InterruptedException {
    List<String> ablationFeatures = getAblationFeatures("./configSBUProcRel/features.ablation");
    ArrayList<String> stepwiseFeatures = new ArrayList<String>();
    for (int idxAblation = 0; idxAblation < ablationFeatures.size(); idxAblation++) {
        double maxF1 = Double.MIN_VALUE;
        ArrayList<String> currentBestFeat = new ArrayList<String>();
        String[] metricsBest = null;
        for (int idxFeat = 0; idxFeat < ablationFeatures.size(); idxFeat++) {
            Thread.sleep(3000); // Pause so the operator can follow the run.
            ArrayList<String> addedFeatures = new ArrayList<String>();
            addedFeatures.addAll(Arrays.asList(ablationFeatures.get(idxFeat).split(",")));
            // Skip any group that shares a feature with what is already selected.
            boolean triedFeatures = false;
            for (int i = 0; i < addedFeatures.size(); i++) {
                if (stepwiseFeatures.contains(addedFeatures.get(i))) {
                    triedFeatures = true;
                }
            }
            if (triedFeatures) {
                continue;
            }
            System.out.println("Adding features : " + ablationFeatures.get(idxFeat));
            stepwiseFeatures.addAll(addedFeatures);
            // The trainer reads the active feature set from this config file.
            FileUtil.dumpToFile(stepwiseFeatures, "./configSBUProcRel/features");

            for (int idxFold = 1; idxFold <= crossValidation; idxFold++) {
                File trainFoldDir = new File(outputDir.concat("/fold-").concat("" + idxFold).concat("/train"));
                File testFoldDir = new File(outputDir.concat("/fold-").concat("" + idxFold).concat("/test"));
                SBURoleTrain trainer = new SBURoleTrain(trainFoldDir.getAbsolutePath().concat("/train.ser"),
                        isMultiClass);
                trainer.train(trainFoldDir.getAbsolutePath());

                SBURolePredict predict = new SBURolePredict(trainFoldDir.getAbsolutePath(),
                        testFoldDir.getAbsolutePath().concat("/test.arggold.ser"), isMultiClass);
                predict.performPrediction(testFoldDir.getAbsolutePath().concat("/test.arggold.ser"));

                ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
                        .deserializeFromFile(testFoldDir.getAbsolutePath().concat("/test.argpredict.ser"));
                Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
                        .collect(Collectors.groupingBy(Sentence::getProcessName));

                ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.srlout.json"), false);
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.srlpredict.json"), true);
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.ilppredict.json"), true); // dummy
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.semaforpredict.json"), true);// dummy
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.easysrlpredict.json"), true);// dummy
                SentenceUtil.flushDataToJSON(jsonData,
                        testFoldDir.getAbsolutePath().concat("/test.fgpredict.json"), true);// dummy

            }
            // copy all data to ILP's data folder
            // cp -r outputDir /home/slouvan/NetBeansProjects/ILP/data/
            copyAndEval(outputDir);
            String[] lines = FileUtil.readLinesFromFile("/home/slouvan/NetBeansProjects/ILP/stats.txt");
            double currentF1 = Double.parseDouble(lines[0].split("\t")[2]);
            if (currentF1 > maxF1) {
                maxF1 = currentF1;
                currentBestFeat = addedFeatures;
                metricsBest = lines;
            }

            stepwiseFeatures.removeAll(addedFeatures);
        }
        // FIX: try-with-resources so the writer is closed (and flushed) even if println throws.
        try (PrintWriter out = new PrintWriter(
                new BufferedWriter(new FileWriter(GlobalV.PROJECT_DIR + "/additionNew.txt", true)))) {
            out.println((new Date()).toString() + " Best features at this stage is  " + currentBestFeat);
            out.println("Eval : " + Arrays.toString(metricsBest));
            stepwiseFeatures.addAll(currentBestFeat);
            out.println("All current features :" + stepwiseFeatures);
        }
    }
}

From source file:sbu.srl.rolextract.ArgumentClassifier.java

/**
 * Splits {@code sentences} (restricted to the given processes) into {@code nbFold}
 * folds, keeping all sentences of one process inside the same fold, and writes each
 * fold's artifacts (process-name lists, serialized train/test data, and the
 * SEMAFOR / MaltParser input files) under {@code outputDir}.
 *
 * @param outputDir root directory under which fold-N subdirectories are written
 * @param nbFold    number of folds to generate
 * @param processes process names to include; all other sentences are dropped
 */
public void generateDevSet(String outputDir, int nbFold, List<String> processes)
        throws FileNotFoundException, IOException {

    // Keep only sentences belonging to the requested processes.
    sentences = (ArrayList<Sentence>) sentences.stream().filter(s -> processes.contains(s.getProcessName()))
            .collect(Collectors.toList());
    Map<String, List<Sentence>> processSentPair = sentences.stream()
            .collect(Collectors.groupingBy(s -> s.getProcessName()));
    int partitionSize = sentences.size() / nbFold;
    int blockSize = 0;
    int currentFoldCnt = 1;

    ArrayList<Sentence> trainingData = new ArrayList<Sentence>();
    ArrayList<Sentence> testingData = new ArrayList<Sentence>();
    HashMap<String, String> testProcessName = new HashMap<String, String>();
    HashMap<String, String> trainingProcessName = new HashMap<String, String>();
    for (String testingProcess : processSentPair.keySet()) {
        System.out.println(
                "Process " + testingProcess + " Nb Sentence :" + processSentPair.get(testingProcess).size());
        // if foldNumber is equal to totalFold then
        // keep adding to testData
        if (currentFoldCnt == nbFold) {
            System.out.println("Processing last fold");
            testingData.addAll(processSentPair.get(testingProcess));
            testProcessName.put(testingProcess, testingProcess);
        } // if the block counter still less than partition size AND foldNumber is less than totalFold
          // keep adding to testingData
        else if (blockSize < partitionSize && currentFoldCnt < nbFold) {
            System.out.println("Has not reached the boundary, keep adding testing data");
            blockSize += processSentPair.get(testingProcess).size();
            testingData.addAll(processSentPair.get(testingProcess));
            testProcessName.put(testingProcess, testingProcess);
            System.out.println("BLOCK SIZE : " + blockSize);
        } else {
            System.out.println("Boundary reached, get the training data and flush everything");
            // Every process not chosen for testing becomes training data.
            for (String trainingProcess : processSentPair.keySet()) {
                if (testProcessName.get(trainingProcess) == null) {
                    trainingData.addAll(processSentPair.get(trainingProcess));
                    trainingProcessName.put(trainingProcess, trainingProcess);
                }
            }
            flushFold(outputDir, currentFoldCnt, trainingProcessName, testProcessName, trainingData,
                    testingData);
            // Reset the accumulators for the next fold.
            trainingData.clear();
            testingData.clear();
            blockSize = 0;
            currentFoldCnt++;
            testProcessName.clear();
            trainingProcessName.clear();
        }
    }

    // Handle the last fold, which was accumulated but never flushed inside the loop.
    for (String trainingProcess : processSentPair.keySet()) {
        if (testProcessName.get(trainingProcess) == null) {
            trainingData.addAll(processSentPair.get(trainingProcess));
            trainingProcessName.put(trainingProcess, trainingProcess);
        }
    }
    flushFold(outputDir, currentFoldCnt, trainingProcessName, testProcessName, trainingData, testingData);
}

/**
 * Writes one fold to disk: process-name lists, serialized train/test data, the
 * SEMAFOR input files, and the MaltParser run + annotation merge. Extracted
 * because this exact sequence was duplicated verbatim for the in-loop folds and
 * the final fold.
 */
private void flushFold(String outputDir, int currentFoldCnt, HashMap<String, String> trainingProcessName,
        HashMap<String, String> testProcessName, ArrayList<Sentence> trainingData,
        ArrayList<Sentence> testingData) throws FileNotFoundException, IOException {
    System.out.println("Flushing fold " + currentFoldCnt);
    // serialize training & testing processes
    String trainingProcessesStr = Joiner.on("\t").join(trainingProcessName.keySet().iterator());
    String testingProcessessStr = Joiner.on("\t").join(testProcessName.keySet().iterator());
    FileUtil.dumpToFile(trainingProcessesStr,
            outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train_process_name"));
    FileUtil.dumpToFile(testingProcessessStr,
            outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test_process_name"));
    System.out.println("Nb Sentence in train" + trainingData.size());
    System.out.println("Nb Sentence in test" + testingData.size());
    FileUtil.serializeToFile(trainingData,
            outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train.ser"));

    // ==============================================   SEMAFOR ==================================
    SpockDataReader.generateSEMAFORFrameAnnotation(trainingData,
            outputDir.concat("/fold-" + currentFoldCnt)
                    .concat("/train/cv." + currentFoldCnt + ".train.sentences.frame.elements.sbu"),
            outputDir.concat("/fold-" + currentFoldCnt)
                    .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"),
            semOffset); // DUMP REQUIRED DATA FOR SEMAFOR
    SpockDataReader.dumpRawSentences(testingData, outputDir.concat("/fold-" + currentFoldCnt)
            .concat("/test/cv." + currentFoldCnt + ".test.sentence.sbu"));
    SpockDataReader.dumpSentenceLexTargetIdxs(testingData, outputDir.concat("/fold-" + currentFoldCnt)
            .concat("/test/cv." + currentFoldCnt + ".test.process.target"));
    // EXECUTE ./runMalt.sh here
    try {
        ProcessBuilder pb = new ProcessBuilder(MALT_PARSER_PATH,
                outputDir.concat("/fold-" + currentFoldCnt)
                        .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"),
                outputDir.concat("/fold-" + currentFoldCnt).concat("/train"));
        Process p = pb.start(); // Start the process.
        p.waitFor(); // Wait for the parser to finish before merging annotations.
        StdUtil.printOutput(p);
        System.out.println("Script executed successfully");
        AllAnnotationsMergingWithoutNE.mergeAllAnnotations(
                outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tokenized"),
                outputDir.concat("/fold-" + currentFoldCnt).concat("/train/conll"),
                outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tmp"),
                outputDir.concat("/fold-" + currentFoldCnt)
                        .concat("/train/cv." + currentFoldCnt + ".train.sentences.all.lemma.tags.sbu"));
    } catch (Exception e) {
        // Best-effort: parsing/merging failures are logged but do not abort fold generation.
        e.printStackTrace();
    }
    // ==============================================   END OF SEMAFOR ===========================

    FileUtil.serializeToFile(testingData,
            outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test.arggold.ser"));
}

From source file:sbu.srl.rolextract.ArgumentClassifier.java

/**
 * Splits the loaded sentences into {@code nbFold} cross-validation folds and writes, for each
 * fold, the train/test splits plus every file the SEMAFOR / MaltParser pipeline needs under
 * {@code outputDir/fold-<n>/}.
 *
 * <p>Sentences are grouped by process name so that all sentences of a given process end up on
 * the same side of a split (a process is never shared between train and test within a fold).
 *
 * @param outputDir root output directory containing the per-fold train/test sub-directories
 * @param nbFold    total number of folds to produce
 * @throws FileNotFoundException if an output file cannot be created
 * @throws IOException           on any other I/O failure
 * @throws InterruptedException  if the initial delay is interrupted
 */
public void distributeCrossValidationByProcess(String outputDir, int nbFold)
        throws FileNotFoundException, IOException, InterruptedException {
    // Group sentences by their process name; folds are assembled from whole processes.
    Map<String, List<Sentence>> processSentPair = sentences.stream()
            .collect(Collectors.groupingBy(s -> s.getProcessName()));
    int partitionSize = sentences.size() / nbFold; // target number of test sentences per fold
    int blockSize = 0;      // test sentences accumulated so far in the current fold
    int currentFoldCnt = 1;
    // NOTE(review): purpose of this fixed 10s delay is not evident from the code — confirm it is required.
    Thread.sleep(10000);
    System.out.println("Total sentences : " + sentences.size());
    ArrayList<Sentence> trainingData = new ArrayList<Sentence>();
    ArrayList<Sentence> testingData = new ArrayList<Sentence>();
    HashMap<String, String> testProcessName = new HashMap<String, String>();
    HashMap<String, String> trainingProcessName = new HashMap<String, String>();
    for (String testingProcess : processSentPair.keySet()) {
        System.out.println(
                "Process " + testingProcess + " Nb Sentence :" + processSentPair.get(testingProcess).size());
        if (currentFoldCnt == nbFold) {
            // Last fold: everything remaining becomes test data; it is flushed after the loop.
            System.out.println("Processing last fold");
            testingData.addAll(processSentPair.get(testingProcess));
            testProcessName.put(testingProcess, testingProcess);
        } else if (blockSize < partitionSize && currentFoldCnt < nbFold) {
            // Current fold not full yet: keep adding whole processes to its test set.
            System.out.println("Has not reached the boundary, keep adding testing data");
            blockSize += processSentPair.get(testingProcess).size();
            testingData.addAll(processSentPair.get(testingProcess));
            testProcessName.put(testingProcess, testingProcess);
            System.out.println("BLOCK SIZE : " + blockSize);
        } else {
            // Fold boundary reached: every process not in the test set trains this fold,
            // then the fold is written to disk and per-fold state is reset.
            System.out.println("Boundary reached, get the training data and flush everything");
            collectTrainingData(processSentPair, trainingData, trainingProcessName, testProcessName);
            flushFold(outputDir, currentFoldCnt, trainingData, testingData,
                    trainingProcessName, testProcessName);
            trainingData.clear();
            testingData.clear();
            blockSize = 0;
            currentFoldCnt++;
            testProcessName.clear();
            trainingProcessName.clear();
        }
    }

    // Handle the last fold: its test data was accumulated inside the loop above.
    collectTrainingData(processSentPair, trainingData, trainingProcessName, testProcessName);
    flushFold(outputDir, currentFoldCnt, trainingData, testingData,
            trainingProcessName, testProcessName);
}

/**
 * Adds every process that is not assigned to the test set to the training data, and records
 * its name in {@code trainingProcessName}.
 */
private void collectTrainingData(Map<String, List<Sentence>> processSentPair,
        ArrayList<Sentence> trainingData, HashMap<String, String> trainingProcessName,
        HashMap<String, String> testProcessName) {
    for (String trainingProcess : processSentPair.keySet()) {
        if (testProcessName.get(trainingProcess) == null) {
            trainingData.addAll(processSentPair.get(trainingProcess));
            trainingProcessName.put(trainingProcess, trainingProcess);
        }
    }
}

/**
 * Writes one complete fold to disk: train/test process-name lists, serialized sentence data,
 * SEMAFOR input files, and the MaltParser + annotation-merging step for the training side.
 */
private void flushFold(String outputDir, int foldCnt, ArrayList<Sentence> trainingData,
        ArrayList<Sentence> testingData, HashMap<String, String> trainingProcessName,
        HashMap<String, String> testProcessName) throws FileNotFoundException, IOException {
    System.out.println("Flushing fold " + foldCnt);
    String foldDir = outputDir.concat("/fold-" + foldCnt);
    // Persist which processes belong to train and to test in this fold.
    String trainingProcessesStr = Joiner.on("\t").join(trainingProcessName.keySet().iterator());
    String testingProcessessStr = Joiner.on("\t").join(testProcessName.keySet().iterator());
    FileUtil.dumpToFile(trainingProcessesStr, foldDir.concat("/train/train_process_name"));
    FileUtil.dumpToFile(testingProcessessStr, foldDir.concat("/test/test_process_name"));
    System.out.println("Nb Sentence in train" + trainingData.size());
    System.out.println("Nb Sentence in test" + testingData.size());
    FileUtil.serializeToFile(trainingData, foldDir.concat("/train/train.ser"));

    // ---- SEMAFOR input generation ----
    SpockDataReader.generateSEMAFORFrameAnnotation(trainingData,
            foldDir.concat("/train/cv." + foldCnt + ".train.sentences.frame.elements.sbu"),
            foldDir.concat("/train/cv." + foldCnt + ".train.sentence.sbu"),
            semOffset); // dump the data SEMAFOR requires
    SpockDataReader.dumpRawSentences(testingData,
            foldDir.concat("/test/cv." + foldCnt + ".test.sentence.sbu"));
    SpockDataReader.dumpSentenceLexTargetIdxs(testingData,
            foldDir.concat("/test/cv." + foldCnt + ".test.process.target"));
    // Run the MaltParser script over the training sentences, then merge all annotations.
    try {
        ProcessBuilder pb = new ProcessBuilder(MALT_PARSER_PATH,
                foldDir.concat("/train/cv." + foldCnt + ".train.sentence.sbu"),
                foldDir.concat("/train"));
        Process p = pb.start(); // start the external parser
        p.waitFor(); // wait for the process to finish
        StdUtil.printOutput(p);
        System.out.println("Script executed successfully");
        AllAnnotationsMergingWithoutNE.mergeAllAnnotations(
                foldDir.concat("/train/tokenized"),
                foldDir.concat("/train/conll"),
                foldDir.concat("/train/tmp"),
                foldDir.concat("/train/cv." + foldCnt + ".train.sentences.all.lemma.tags.sbu"));
    } catch (Exception e) {
        // Best-effort: a parser failure should not abort the remaining folds,
        // but an interrupt must never be swallowed silently.
        if (e instanceof InterruptedException) {
            Thread.currentThread().interrupt();
        }
        e.printStackTrace();
    }
    FileUtil.serializeToFile(testingData, foldDir.concat("/test/test.arggold.ser"));
}

From source file:sbu.srl.rolextract.ArgumentClassifier.java

/**
 * Builds a single train/test split: all currently loaded sentences become training data, while
 * the test sentences are read from {@code testingFileName}. Writes both splits, plus the
 * SEMAFOR / MaltParser input files for the training side, under {@code outputDir/fold-1/}.
 *
 * @throws FileNotFoundException  if an output file cannot be created
 * @throws IOException            on any other I/O failure
 * @throws InterruptedException   if the initial delay is interrupted
 * @throws ClassNotFoundException declared for serialization helpers used by this class
 */
public void distributeTrainTest()
        throws FileNotFoundException, IOException, InterruptedException, ClassNotFoundException {
    //sentences = (ArrayList<Sentence>) FileUtil.deserializeFromFile("./data/training_4_roles.ser");
    // Group sentences by process name; the whole set goes to training in this split.
    Map<String, List<Sentence>> processSentPair = sentences.stream()
            .collect(Collectors.groupingBy(s -> s.getProcessName()));
    int currentFoldCnt = 1; // single split, written as fold-1
    // NOTE(review): purpose of this fixed 10s delay is not evident from the code — confirm it is required.
    Thread.sleep(10000);
    System.out.println("Total sentences : " + sentences.size());
    ArrayList<Sentence> trainingData = new ArrayList<Sentence>();
    ArrayList<Sentence> testingData = new ArrayList<Sentence>();
    HashMap<String, String> testProcessName = new HashMap<String, String>();
    HashMap<String, String> trainingProcessName = new HashMap<String, String>();
    // testProcessName is empty at this point, so every process is assigned to training.
    for (String trainingProcess : processSentPair.keySet()) {
        if (testProcessName.get(trainingProcess) == null) {
            trainingData.addAll(processSentPair.get(trainingProcess));
            trainingProcessName.put(trainingProcess, trainingProcess);
        }
    }
    // Persist the training process names and the serialized training data.
    String trainingProcessesStr = Joiner.on("\t").join(trainingProcessName.keySet().iterator());
    FileUtil.dumpToFile(trainingProcessesStr,
            outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train_process_name"));
    System.out.println("Nb Sentence in train" + trainingData.size());
    FileUtil.serializeToFile(trainingData,
            outputDir.concat("/fold-" + currentFoldCnt).concat("/train/train.ser"));
    // ---- SEMAFOR input generation for the training side ----
    SpockDataReader.generateSEMAFORFrameAnnotation(trainingData,
            outputDir.concat("/fold-" + currentFoldCnt)
                    .concat("/train/cv." + currentFoldCnt + ".train.sentences.frame.elements.sbu"),
            outputDir.concat("/fold-" + currentFoldCnt)
                    .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"),
            semOffset); // dump the data SEMAFOR requires

    // ---- Testing side: read the test sentences from a separate input file ----
    SpockDataReader testDataReader = new SpockDataReader(testingFileName, configFileName, true);
    testDataReader.readData();
    ArrayList<Sentence> testingSentences = testDataReader.getSentences();
    // NOTE(review): hard-coded absolute path — consider making this configurable.
    FileUtil.serializeToFile(testingSentences,
            "/home/slouvan/NetBeansProjects/SRL-Integrated/thousand_sentences.ser");
    Map<String, List<Sentence>> testProcessSentPair = testingSentences.stream()
            .collect(Collectors.groupingBy(s -> s.getProcessName()));
    for (String testingProcess : testProcessSentPair.keySet()) {
        testProcessName.put(testingProcess, testingProcess);
        testingData.addAll(testProcessSentPair.get(testingProcess));
    }
    String testingProcessessStr = Joiner.on("\t").join(testProcessName.keySet().iterator());
    System.out.println("Nb Sentence in test" + testingData.size());

    FileUtil.dumpToFile(testingProcessessStr,
            outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test_process_name"));
    SpockDataReader.dumpRawSentences(testingData, outputDir.concat("/fold-" + currentFoldCnt)
            .concat("/test/cv." + currentFoldCnt + ".test.sentence.sbu"));
    SpockDataReader.dumpSentenceLexTargetIdxs(testingData, outputDir.concat("/fold-" + currentFoldCnt)
            .concat("/test/cv." + currentFoldCnt + ".test.process.target"));
    // Run the MaltParser script over the training sentences, then merge all annotations.
    try {
        ProcessBuilder pb = new ProcessBuilder(MALT_PARSER_PATH,
                outputDir.concat("/fold-" + currentFoldCnt)
                        .concat("/train/cv." + currentFoldCnt + ".train.sentence.sbu"),
                outputDir.concat("/fold-" + currentFoldCnt).concat("/train"));
        Process p = pb.start(); // start the external parser
        p.waitFor(); // wait for the process to finish
        StdUtil.printOutput(p);
        System.out.println("Script executed successfully");
        AllAnnotationsMergingWithoutNE.mergeAllAnnotations(
                outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tokenized"),
                outputDir.concat("/fold-" + currentFoldCnt).concat("/train/conll"),
                outputDir.concat("/fold-" + currentFoldCnt).concat("/train/tmp"),
                outputDir.concat("/fold-" + currentFoldCnt)
                        .concat("/train/cv." + currentFoldCnt + ".train.sentences.all.lemma.tags.sbu"));
    } catch (Exception e) {
        // Best-effort: log and continue, but never swallow an interrupt silently.
        if (e instanceof InterruptedException) {
            Thread.currentThread().interrupt();
        }
        e.printStackTrace();
    }
    FileUtil.serializeToFile(testingData,
            outputDir.concat("/fold-" + currentFoldCnt).concat("/test/test.arggold.ser"));
}

From source file:sbu.srl.rolextract.ArgumentClassifier.java

/**
 * Shuffles the loaded sentences, splits them into train/test by {@code trainPctg}, trains the
 * role classifier (multi-class or binary depending on {@code isMultiClass}), runs prediction on
 * the held-out split, and dumps both gold and predicted results as JSON under
 * {@code outputDir/fold-1/}.
 *
 * @param trainPctg fraction (0..1) of sentences used for training; the rest are held out
 * @throws IOException            on serialization or file-system failures
 * @throws ClassNotFoundException if deserializing the prediction output fails
 */
public void doTrainClassify(double trainPctg) throws IOException, FileNotFoundException, ClassNotFoundException,
        NoSuchMethodException, IllegalAccessException, IllegalArgumentException, InvocationTargetException {
    setupCrossValidationEnvironment(outputDir, 1);
    // Shuffle so the split is random on every run (non-deterministic by design).
    Collections.shuffle(sentences, new Random(System.nanoTime()));
    int nbTrain = (int) (trainPctg * sentences.size());
    ArrayList<Sentence> trainingData = new ArrayList<>();
    ArrayList<Sentence> testingData = new ArrayList<>();

    trainingData.addAll(sentences.subList(0, nbTrain));
    testingData.addAll(sentences.subList(nbTrain, sentences.size()));

    FileUtil.serializeToFile(trainingData, outputDir.concat("/fold-1").concat("/train/train.ser"));
    FileUtil.serializeToFile(testingData, outputDir.concat("/fold-1").concat("/test/test.arggold.ser"));

    File trainFoldDir = new File(outputDir.concat("/fold-1").concat("/train"));
    File testFoldDir = new File(outputDir.concat("/fold-1").concat("/test"));
    // Train from the serialized training file produced above.
    SBURoleTrain trainer = new SBURoleTrain(trainFoldDir.getAbsolutePath().concat("/train.ser"), isMultiClass);
    if (isMultiClass) {
        trainer.trainMultiClassClassifier(trainFoldDir.getAbsolutePath());
    } else {
        trainer.trainBinaryClassifier(trainFoldDir.getAbsolutePath());
    }
    // NOTE(review): this re-serialization duplicates the identical write above — confirm whether it is needed.
    FileUtil.serializeToFile(trainingData, outputDir.concat("/fold-1").concat("/train/train.ser"));
    // Predict roles on the held-out gold data; output is written to test.argpredict.ser.
    SBURolePredict predict = new SBURolePredict(trainFoldDir.getAbsolutePath(),
            testFoldDir.getAbsolutePath().concat("/test.arggold.ser"), isMultiClass);
    predict.performPrediction(testFoldDir.getAbsolutePath().concat("/test.arggold.ser"));

    // Group predictions by process and emit gold + predicted JSON views.
    ArrayList<Sentence> predictedSentences = (ArrayList<Sentence>) FileUtil
            .deserializeFromFile(testFoldDir.getAbsolutePath().concat("/test.argpredict.ser"));
    Map<String, List<Sentence>> groupByProcess = predictedSentences.stream()
            .collect(Collectors.groupingBy(Sentence::getProcessName));

    ArrayList<JSONData> jsonData = SentenceUtil.generateJSONData(groupByProcess);
    SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.srlout.json"), false);
    SentenceUtil.flushDataToJSON(jsonData, testFoldDir.getAbsolutePath().concat("/test.srlpredict.json"), true);
}