Example usage for org.apache.spark.api.java.function.PairFlatMapFunction

Introduction

On this page you can find example usages of org.apache.spark.api.java.function.PairFlatMapFunction.

Prototype

PairFlatMapFunction

Source Link

Usage

From source file:weka.distributed.spark.RandomizedDataSparkJob.java

License:Open Source License

/**
 * Randomly shuffles the input instances and, when the class attribute is
 * nominal, additionally deals them out to folds (stratification). The final
 * RDD holds {@code Instance} objects, which avoids further parsing in
 * subsequent jobs. It is stored in {@code m_sortedByFold}, registered as the
 * {@code TRAINING_DATA} dataset and, if {@code m_writeRandomizedDataToOutput}
 * is set, written to {@code outputPath}.
 * <p>
 * Each intermediate RDD is persisted and materialized with {@code count()}
 * before the RDD it was derived from is unpersisted, so the predecessor is
 * still cached while its successor is being computed.
 *
 * @param input the instances to shuffle
 * @param outputPath destination handed to {@code writeRandomizedSplits}
 * @param numFoldSlices number of partitions used by the
 *          {@code IntegerKeyPartitioner}; also handed to both phases
 * @param random random number generator handed to phase 1
 * @param headerWithSummary header of the data, including summary attributes
 * @param classIndex the classIndex to use
 * @throws IOException if a problem occurs
 * @throws DistributedWekaException if a problem occurs
 */
protected void performRandomShuffle(JavaRDD<Instance> input, String outputPath, final int numFoldSlices,
        final Random random, final Instances headerWithSummary, int classIndex)
        throws IOException, DistributedWekaException {

    final Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);
    headerNoSummary.setClassIndex(classIndex);

    boolean containsStringOrRelational = headerNoSummary.checkForStringAttributes()
            || headerNoSummary.checkForAttributeType(Attribute.RELATIONAL);

    final PhaseOneRandomization phaseOne = new PhaseOneRandomization(headerNoSummary, getCSVMapTaskOptions(),
            containsStringOrRelational, random, numFoldSlices);

    // Phase 1 - randomly shuffle the data
    logMessage("[Randomly shuffle data] Starting phase 1...");
    JavaPairRDD<Integer, Object> mapResults = input
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, Object>() {

                /** For serialization */
                private static final long serialVersionUID = -5351850875358513817L;

                @Override
                public Iterable<Tuple2<Integer, Object>> call(Iterator<Instance> split)
                        throws IOException, DistributedWekaException {

                    // Local (not a field) so repeated invocations can never
                    // see rows produced by an earlier invocation.
                    List<Tuple2<Integer, Object>> randomizedRows = new ArrayList<Tuple2<Integer, Object>>();

                    while (split.hasNext()) {
                        Tuple2<Integer, Object> processed = phaseOne.process(split.next());
                        randomizedRows.add(processed);
                    }

                    return randomizedRows;
                }
            }).persist(getCachingStrategy().getStorageLevel());

    // Now sort into ascending order of random assignment number
    JavaPairRDD<Integer, Object> sortedByAssignment = mapResults.sortByKey(true)
            .partitionBy(new IntegerKeyPartitioner(numFoldSlices))
            .persist(getCachingStrategy().getStorageLevel());
    sortedByAssignment.count(); // materialize before releasing the phase 1 output

    // discard mapResults
    mapResults.unpersist();

    if (headerNoSummary.classIndex() < 0 || headerNoSummary.classAttribute().isNumeric()) {

        // No need for the second phase of dealing classes out to splits
        // if there is no class or a numeric class.
        // NOTE(review): rows are cast to Instance here without consulting
        // containsStringOrRelational, unlike the stratified branch below -
        // confirm that phase 1 always emits Instance objects for this input.
        // , true here because we preserve the partitions from sortedByAssignment
        JavaRDD<Instance> finalDataSet = sortedByAssignment
                .mapPartitions(attachHeaderFunction(headerNoSummary), true)
                .persist(getCachingStrategy().getStorageLevel());

        finalDataSet.count(); // materialize this RDD

        logMessage("[Randomly shuffle data] Unpersisting sorted phase 1 RDD");
        sortedByAssignment.unpersist();

        m_sortedByFold = finalDataSet;

    } else {
        // phase 2 - deal classes out to splits + oversample minority classes
        final PhaseTwoStratification phaseTwo = new PhaseTwoStratification(headerNoSummary, numFoldSlices,
                false);

        logMessage("[Randomly shuffle data] Starting phase 2 (deal to folds/stratification)...");
        JavaPairRDD<Integer, Object> dealtToFolds = sortedByAssignment.mapPartitionsToPair(
                new PairFlatMapFunction<Iterator<Tuple2<Integer, Object>>, Integer, Object>() {

                    /** For serialization */
                    private static final long serialVersionUID = -5903374381393577497L;

                    @Override
                    public Iterable<Tuple2<Integer, Object>> call(Iterator<Tuple2<Integer, Object>> split) {

                        List<Tuple2<Integer, Object>> dealtRows = new ArrayList<Tuple2<Integer, Object>>();

                        while (split.hasNext()) {
                            Tuple2<Integer, Object> current = split.next();
                            Tuple2<Integer, Object> result = phaseTwo.process(current._2);

                            dealtRows.add(result);
                        }

                        // must run after the whole partition has been dealt
                        phaseTwo.checkForMinorityClassCases(dealtRows);

                        return dealtRows;
                    }
                }).persist(getCachingStrategy().getStorageLevel());

        logMessage("[Randomly shuffle data] Repartitioning phase 2 RDD according to fold number");
        JavaPairRDD<Integer, Object> tmpSortedByFold = dealtToFolds.sortByKey()
                .partitionBy(new IntegerKeyPartitioner(numFoldSlices))
                .persist(getCachingStrategy().getStorageLevel());

        tmpSortedByFold.count(); // materialize before releasing its inputs

        sortedByAssignment.unpersist();
        dealtToFolds.unpersist();

        logMessage("[Randomly shuffle data] Creating and persisting final dataset (RDD<Instance>)...");
        JavaRDD<Instance> finalDataSet;

        // , true in both cases because we preserve the partitions from tmpSortedByFold
        if (containsStringOrRelational) {
            CSVToInstancePairFlatMapFunction instanceFunction = new CSVToInstancePairFlatMapFunction(
                    headerNoSummary, getCSVMapTaskOptions());

            finalDataSet = tmpSortedByFold.mapPartitions(instanceFunction, true)
                    .persist(getCachingStrategy().getStorageLevel());
        } else {
            finalDataSet = tmpSortedByFold.mapPartitions(attachHeaderFunction(headerNoSummary), true)
                    .persist(getCachingStrategy().getStorageLevel());
        }

        logMessage("[Randomly shuffle data] forcing materialization of final shuffled data");
        finalDataSet.count();

        logMessage("[Randomly shuffle data] Unpersisting intermediate RDDs");

        tmpSortedByFold.unpersist();

        m_sortedByFold = finalDataSet;
        logMessage("[Randomly shuffle data] Finished shuffling/stratifying RDD. Number of partitions: "
                + m_sortedByFold.partitions().size());
    }

    setDataset(TRAINING_DATA, new Dataset<Instance>(m_sortedByFold, headerWithSummary));

    if (m_writeRandomizedDataToOutput) {
        writeRandomizedSplits(outputPath, m_sortedByFold);
    }
}

/**
 * Creates the partition function that unwraps keyed rows into instances and
 * makes sure each instance has a reference to the given header.
 *
 * @param header the header to set on every instance
 * @return a function mapping a partition of keyed rows to its instances
 */
private static FlatMapFunction<Iterator<Tuple2<Integer, Object>>, Instance> attachHeaderFunction(
        final Instances header) {
    return new FlatMapFunction<Iterator<Tuple2<Integer, Object>>, Instance>() {

        /** For serialization */
        private static final long serialVersionUID = -4129157509045217459L;

        @Override
        public Iterable<Instance> call(Iterator<Tuple2<Integer, Object>> split) {
            List<Instance> instances = new ArrayList<Instance>();

            while (split.hasNext()) {
                Instance nextI = (Instance) split.next()._2();
                nextI.setDataset(header);
                instances.add(nextI);
            }

            return instances;
        }
    };
}

From source file:weka.distributed.spark.WekaAttributeSelectionSparkJob.java

License:Open Source License

/**
 * Builds, for every attribute subset, one classifier per fold.
 * <p>
 * For each iteration a map step trains {@code total-folds} partial
 * classifiers on every (subset, instances) group, and a reduce step
 * aggregates the partial classifiers that share the same (subset, fold) key.
 * The aggregated classifiers are collected to the driver and stored in the
 * returned map, which is also what the next iteration's map step starts from.
 *
 * @param dataset the instances grouped by attribute subset
 * @param subsetList the attribute subsets to build classifiers for
 * @param headerNoSummary header of the data without summary attributes; its
 *          class index is applied to the header of the incoming instances
 * @return a map from attribute subset to its classifiers, indexed by fold
 * @throws Exception if a problem occurs
 */
protected Map<BitSet, Classifier[]> phaseOneBuildClassifiers(JavaPairRDD<BitSet, Iterable<Instance>> dataset,
        BitSet[] subsetList, final Instances headerNoSummary) throws Exception {

    int totalFolds = 1;
    final String classifierMapTaskOptions = environmentSubstitute(
            m_classifierJob.getClassifierMapTaskOptions());
    String[] cOpts = Utils.splitOptions(classifierMapTaskOptions);
    // each lookup gets its own copy of the options array
    String numFolds = Utils.getOption("total-folds", cOpts.clone());
    final boolean forceVote = Utils.getFlag("force-vote", cOpts.clone());
    if (!DistributedJobConfig.isEmpty(numFolds)) {
        totalFolds = Integer.parseInt(numFolds);
    }
    final int tFolds = totalFolds;

    final Map<BitSet, Classifier[]> foldClassifiers = new HashMap<BitSet, Classifier[]>();
    for (BitSet subset : subsetList) {
        foldClassifiers.put(subset, new Classifier[totalFolds]);
    }

    // just use headerNoSummary for class index
    final int classIndex = headerNoSummary.classIndex();
    final int numPartitions = dataset.partitions().size();
    final int numSplits = numPartitions;

    int numIterations = m_classifierJob.getNumIterations();

    for (int i = 0; i < numIterations; i++) {
        final int iterationNum = i;
        logMessage("[WekaClassifierEvaluation] Phase 1 (map), iteration " + (i + 1));

        JavaPairRDD<Tuple2<BitSet, Integer>, Classifier> mapFolds = dataset.flatMapToPair(
                new PairFlatMapFunction<Tuple2<BitSet, Iterable<Instance>>, Tuple2<BitSet, Integer>, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = -1906414304952140395L;

                    @Override
                    public Iterable<Tuple2<Tuple2<BitSet, Integer>, Classifier>> call(
                            Tuple2<BitSet, Iterable<Instance>> arg0)
                            throws IOException, DistributedWekaException {

                        final BitSet subset = arg0._1();
                        PreconstructedFilter preconstructedFilter = GetFilterFromBitSet(subset,
                                headerNoSummary);
                        Iterator<Instance> split = arg0._2().iterator();

                        // Guard before next(): an empty iterator would otherwise
                        // fail with NoSuchElementException instead of the
                        // intended IOException.
                        if (!split.hasNext()) {
                            throw new IOException("No data in this partition!!");
                        }
                        Instance current = split.next();
                        if (current == null) {
                            throw new IOException("No data in this partition!!");
                        }

                        // sets the class index on the header the instances refer to
                        Instances header = current.dataset();
                        header.setClassIndex(classIndex);

                        WekaClassifierMapTask[] tasks = new WekaClassifierMapTask[tFolds];
                        for (int j = 0; j < tFolds; j++) {
                            try {
                                tasks[j] = new WekaClassifierMapTask();
                                WekaClassifierSparkJob.configureClassifierMapTask(tasks[j],
                                        foldClassifiers.get(subset)[j], classifierMapTaskOptions,
                                        iterationNum, preconstructedFilter, numSplits);

                                // set fold number and total folds
                                tasks[j].setFoldNumber(j + 1);
                                tasks[j].setTotalNumFolds(tFolds);
                                Environment env = new Environment();
                                env.addVariable(WekaClassifierMapTask.TOTAL_NUMBER_OF_MAPS, "" + numPartitions);
                                tasks[j].setEnvironment(env);
                            } catch (Exception ex) {
                                logMessage(ex);
                                throw new DistributedWekaException(ex);
                            }

                            // initialize
                            tasks[j].setup(headerNoSummary);
                        }

                        // NOTE(review): the instance read above is only used to
                        // obtain the header and is never passed to
                        // processInstance. Left unchanged on purpose - confirm
                        // how the evaluation phase walks the same data before
                        // altering which rows are seen here.
                        while (split.hasNext()) {
                            current = split.next();

                            for (int j = 0; j < tFolds; j++) {
                                tasks[j].processInstance(current);
                            }
                        }

                        // This function is invoked once per (subset, instances)
                        // element, so the result list must be local: a list
                        // held in a field would re-emit the classifiers of
                        // every element handled earlier by this instance.
                        List<Tuple2<Tuple2<BitSet, Integer>, Classifier>> classifiersForFolds =
                                new ArrayList<Tuple2<Tuple2<BitSet, Integer>, Classifier>>(tFolds);
                        for (int j = 0; j < tFolds; j++) {
                            tasks[j].finalizeTask();
                            classifiersForFolds.add(new Tuple2<Tuple2<BitSet, Integer>, Classifier>(
                                    new Tuple2<BitSet, Integer>(subset, j), tasks[j].getClassifier()));
                        }

                        return classifiersForFolds;
                    }

                });
        // memory and disk here for fast access and to avoid
        // recomputing partial classifiers if all partial classifiers
        // can't fit in memory
        mapFolds = mapFolds.persist(StorageLevel.MEMORY_AND_DISK());

        // reduce fold models
        logMessage("[WekaClassifierEvaluation] Phase 1 (reduce), iteration " + (i + 1));
        JavaPairRDD<Tuple2<BitSet, Integer>, Classifier> reducedByFold = mapFolds.groupByKey().mapToPair(
                new PairFunction<Tuple2<Tuple2<BitSet, Integer>, Iterable<Classifier>>, Tuple2<BitSet, Integer>, Classifier>() {
                    /** For serialization */
                    private static final long serialVersionUID = 2481672301097842496L;

                    @Override
                    public Tuple2<Tuple2<BitSet, Integer>, Classifier> call(
                            Tuple2<Tuple2<BitSet, Integer>, Iterable<Classifier>> arg0) throws Exception {

                        List<Classifier> classifiers = new ArrayList<Classifier>();
                        for (Classifier partial : arg0._2()) {
                            classifiers.add(partial);
                        }

                        WekaClassifierReduceTask reduceTask = new WekaClassifierReduceTask();
                        Classifier intermediateClassifier = reduceTask.aggregate(classifiers, null, forceVote);

                        return new Tuple2<Tuple2<BitSet, Integer>, Classifier>(arg0._1(),
                                intermediateClassifier);
                    }

                });

        List<Tuple2<Tuple2<BitSet, Integer>, Classifier>> aggregated = reducedByFold.collect();
        for (Tuple2<Tuple2<BitSet, Integer>, Classifier> t : aggregated) {
            // key is (subset, fold index): store the classifier in that subset's fold slot
            foldClassifiers.get(t._1()._1())[t._1()._2()] = t._2();
        }

        mapFolds.unpersist();
        reducedByFold.unpersist();
    }

    return foldClassifiers;
}

From source file:weka.distributed.spark.WekaAttributeSelectionSparkJob.java

License:Open Source License

/**
 * Evaluates every attribute subset in {@code subsetList} and returns the
 * best one.
 * <p>
 * Each instance is paired with every subset and the pairs are grouped by
 * subset. Classifiers are then built ({@code phaseOneBuildClassifiers}) and
 * evaluated ({@code phaseTwoEvaluateClassifiers}) on the grouped data. The
 * score of a subset is the negated error rate for a nominal class and the
 * negated mean absolute error otherwise, so a larger score is better. A NaN
 * score is replaced by a large negative value.
 *
 * @param subsetList the attribute subsets to evaluate
 * @param dataSet the instances to evaluate on
 * @param headerNoSummary header of the data without summary attributes
 * @param headerWithSummary header of the data with summary attributes
 * @return the best score together with the subset that achieved it
 * @throws IllegalStateException if no subset produced a usable result
 * @throws Exception if a problem occurs while building or evaluating
 */
protected Tuple2<Double, BitSet> EvaluateSubset(BitSet[] subsetList, JavaRDD<Instance> dataSet,
        Instances headerNoSummary, Instances headerWithSummary) throws Exception {

    logMessage("Evaluate Subsets: ");
    for (BitSet n : subsetList) {
        logMessage(n.toString());
    }

    final BitSet[] finalSubsets = subsetList;

    JavaPairRDD<BitSet, Instance> bitsetInstanceData = dataSet
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, BitSet, Instance>() {

                /** For serialization */
                private static final long serialVersionUID = -7672702819274482635L;

                @Override
                public Iterable<Tuple2<BitSet, Instance>> call(Iterator<Instance> split) throws Exception {

                    List<Tuple2<BitSet, Instance>> returnValue = new ArrayList<Tuple2<BitSet, Instance>>();

                    // emit one (subset, instance) pair per subset for every instance
                    while (split.hasNext()) {
                        Instance current = split.next();

                        for (BitSet s : finalSubsets) {
                            returnValue.add(new Tuple2<BitSet, Instance>(s, current));
                        }
                    }

                    return returnValue;
                }

            }, true);

    JavaPairRDD<BitSet, Iterable<Instance>> groupedData = bitsetInstanceData.groupByKey();

    Map<BitSet, Classifier[]> foldClassifiers = phaseOneBuildClassifiers(groupedData, subsetList,
            headerNoSummary);

    logMessage("Phase 1 done");
    for (Map.Entry<BitSet, Classifier[]> entry : foldClassifiers.entrySet()) {
        logMessage("Bitset: " + entry.getKey().toString() + " Classifiers: " + entry.getValue().length);
    }

    List<Tuple2<BitSet, Evaluation>> results = phaseTwoEvaluateClassifiers(groupedData, headerWithSummary,
            headerNoSummary, foldClassifiers);

    // get best result
    double bestEval = Double.NEGATIVE_INFINITY;
    BitSet bestSubset = null;

    logMessage("Result count = " + results.size());

    for (Tuple2<BitSet, Evaluation> result : results) {
        // negate the error so that "larger is better" holds for both class types
        double eval;
        if (headerNoSummary.classAttribute().isNominal()) {
            eval = -result._2().errorRate();
        } else {
            eval = -result._2().meanAbsoluteError();
        }

        // NaN never compares greater than anything; rank it as a very poor score instead
        if (Double.isNaN(eval)) {
            eval = -100000000d;
        }

        logMessage("Result = " + result._1().toString() + " ==> " + eval);

        if (eval > bestEval) {
            bestEval = eval;
            bestSubset = result._1();
        }
    }

    // Fail with a clear message rather than a NullPointerException below
    if (bestSubset == null) {
        throw new IllegalStateException(
                "No best subset could be determined from " + results.size() + " evaluation result(s)");
    }

    logMessage("Best Result = " + bestSubset.toString() + " ==> " + bestEval);

    return new Tuple2<Double, BitSet>(bestEval, bestSubset);
}

From source file:weka.distributed.spark.WekaClassifierEvaluationSparkJob.java

License:Open Source License

/**
 * Builds one classifier per fold from the supplied data.
 * <p>
 * For each iteration a map step trains {@code total-folds} partial
 * classifiers on every partition and keys them by fold index. The partial
 * classifiers are then repartitioned with an {@code IntegerKeyPartitioner}
 * and a reduce step aggregates each partition into a single classifier. The
 * aggregated classifiers are collected to the driver and stored in the
 * returned array, which is also what the next iteration's map step starts
 * from.
 *
 * @param dataset the instances to train on
 * @param headerNoSummary header of the data without summary attributes; its
 *          class index is applied to the header of the incoming instances
 * @param preconstructedFilter filter handed to every classifier map task
 * @return the classifiers, indexed by fold
 * @throws Exception if a problem occurs
 */
protected Classifier[] phaseOneBuildClassifiers(JavaRDD<Instance> dataset, final Instances headerNoSummary,
        final PreconstructedFilter preconstructedFilter) throws Exception {

    int totalFolds = 1;
    final String classifierMapTaskOptions = environmentSubstitute(
            m_classifierJob.getClassifierMapTaskOptions());
    String[] cOpts = Utils.splitOptions(classifierMapTaskOptions);
    // each lookup gets its own copy of the options array
    String numFolds = Utils.getOption("total-folds", cOpts.clone());
    final boolean forceVote = Utils.getFlag("force-vote", cOpts.clone());
    if (!DistributedJobConfig.isEmpty(numFolds)) {
        totalFolds = Integer.parseInt(numFolds);
    }
    final int tFolds = totalFolds;

    final Classifier[] foldClassifiers = new Classifier[totalFolds];

    // just use headerNoSummary for class index
    final int classIndex = headerNoSummary.classIndex();
    final int numPartitions = dataset.partitions().size();
    final int numSplits = numPartitions;

    int numIterations = m_classifierJob.getNumIterations();

    for (int i = 0; i < numIterations; i++) {
        final int iterationNum = i;
        logMessage("[WekaClassifierEvaluation] Phase 1 (map), iteration " + (i + 1));

        JavaPairRDD<Integer, Classifier> mapFolds = dataset
                .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = -1906414304952140395L;

                    @Override
                    public Iterable<Tuple2<Integer, Classifier>> call(Iterator<Instance> split)
                            throws IOException, DistributedWekaException {

                        // Guard before next(): an empty partition would
                        // otherwise fail with NoSuchElementException instead
                        // of the intended IOException.
                        if (!split.hasNext()) {
                            throw new IOException("No data in this partition!!");
                        }
                        Instance current = split.next();
                        if (current == null) {
                            throw new IOException("No data in this partition!!");
                        }

                        // sets the class index on the header the instances refer to
                        Instances header = current.dataset();
                        header.setClassIndex(classIndex);

                        WekaClassifierMapTask[] tasks = new WekaClassifierMapTask[tFolds];
                        for (int j = 0; j < tFolds; j++) {
                            try {
                                tasks[j] = new WekaClassifierMapTask();
                                WekaClassifierSparkJob.configureClassifierMapTask(tasks[j], foldClassifiers[j],
                                        classifierMapTaskOptions, iterationNum, preconstructedFilter,
                                        numSplits);

                                // set fold number and total folds
                                tasks[j].setFoldNumber(j + 1);
                                tasks[j].setTotalNumFolds(tFolds);
                                Environment env = new Environment();
                                env.addVariable(WekaClassifierMapTask.TOTAL_NUMBER_OF_MAPS, "" + numPartitions);
                                tasks[j].setEnvironment(env);
                            } catch (Exception ex) {
                                logMessage(ex);
                                throw new DistributedWekaException(ex);
                            }

                            // initialize
                            tasks[j].setup(headerNoSummary);
                        }

                        // NOTE(review): the instance read above is only used to
                        // obtain the header and is never passed to
                        // processInstance. Left unchanged on purpose - confirm
                        // how the evaluation phase walks the same data before
                        // altering which rows are seen here.
                        while (split.hasNext()) {
                            current = split.next();

                            for (int j = 0; j < tFolds; j++) {
                                tasks[j].processInstance(current);
                            }
                        }

                        // Local (not a field) so repeated invocations can never
                        // see classifiers produced by an earlier invocation.
                        List<Tuple2<Integer, Classifier>> classifiersForFolds =
                                new ArrayList<Tuple2<Integer, Classifier>>(tFolds);
                        for (int j = 0; j < tFolds; j++) {
                            tasks[j].finalizeTask();
                            classifiersForFolds
                                    .add(new Tuple2<Integer, Classifier>(j, tasks[j].getClassifier()));
                        }

                        return classifiersForFolds;
                    }
                });

        // memory and disk here for fast access and to avoid
        // recomputing partial classifiers if all partial classifiers
        // can't fit in memory
        mapFolds = mapFolds.persist(StorageLevel.MEMORY_AND_DISK());
        JavaPairRDD<Integer, Classifier> mapFoldsSorted = mapFolds.sortByKey()
                .partitionBy(new IntegerKeyPartitioner(totalFolds))
                .persist(StorageLevel.MEMORY_AND_DISK());

        // reduce fold models
        logMessage("[WekaClassifierEvaluation] Phase 1 (reduce), iteration " + (i + 1));
        JavaPairRDD<Integer, Classifier> reducedByFold = mapFoldsSorted.mapPartitionsToPair(
                new PairFlatMapFunction<Iterator<Tuple2<Integer, Classifier>>, Integer, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = 2481672301097842496L;

                    @Override
                    public Iterable<Tuple2<Integer, Classifier>> call(
                            Iterator<Tuple2<Integer, Classifier>> split) throws DistributedWekaException {

                        // -1 until the first partial classifier has been seen
                        int foldNum = -1;

                        List<Classifier> classifiers = new ArrayList<Classifier>();
                        while (split.hasNext()) {
                            Tuple2<Integer, Classifier> partial = split.next();
                            int partialFold = partial._1().intValue();
                            if (foldNum < 0) {
                                foldNum = partialFold;
                            } else if (partialFold != foldNum) {
                                throw new DistributedWekaException("[WekaClassifierEvaluation] build "
                                        + "classifiers reduce phase: was not expecting fold number "
                                        + "to change within a partition!");
                            }
                            classifiers.add(partial._2());
                        }

                        WekaClassifierReduceTask reduceTask = new WekaClassifierReduceTask();
                        Classifier intermediateClassifier = reduceTask.aggregate(classifiers, null, forceVote);

                        // holds the reduced classifier for one fold (partition)
                        List<Tuple2<Integer, Classifier>> reducedForFold =
                                new ArrayList<Tuple2<Integer, Classifier>>(1);
                        reducedForFold.add(new Tuple2<Integer, Classifier>(foldNum, intermediateClassifier));

                        return reducedForFold;
                    }
                });

        List<Tuple2<Integer, Classifier>> aggregated = reducedByFold.collect();
        for (Tuple2<Integer, Classifier> t : aggregated) {
            foldClassifiers[t._1()] = t._2();
        }

        mapFolds.unpersist();
        mapFoldsSorted.unpersist();
        reducedByFold.unpersist();
    }

    return foldClassifiers;
}

From source file:wordcount.Spark_WordCountEachLine.java

License:Apache License

/**
 * Counts the occurrences of each word in all files of an input directory
 * using a local Spark master and writes the counts to
 * {@code <outputDir>/results.txt}.
 * <p>
 * Empty lines and lines starting with {@code <doc} or {@code </doc} are
 * skipped. Lines are split with the class-level {@code pattern} and words are
 * lower-cased before counting.
 *
 * @param args {@code <inputDir> <outputDir> <numCores>}
 * @throws IllegalArgumentException if the number of arguments is not three
 */
public static void main(String[] args) {

    if (args.length != 3) {
        throw new IllegalArgumentException(
                "Usage: " + Spark_WordCountEachLine.class.getName() + " <inputDir> <outputDir> <numCores>");
    }

    long startTime = System.currentTimeMillis();

    String inputDir = args[0];
    String outputDir = args[1];
    int numCores = Integer.parseInt(args[2]);

    SparkConf conf = new SparkConf().setAppName("Spark word count");
    conf.setMaster("local[" + numCores + "]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    List<scala.Tuple2<String, Integer>> res;
    try {
        JavaRDD<String> textLines = sc.textFile(inputDir + "/*");
        res = textLines
                .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                    @Override
                    public Iterable<scala.Tuple2<String, Integer>> call(String line) throws Exception {
                        ArrayList<scala.Tuple2<String, Integer>> listValues = new ArrayList<scala.Tuple2<String, Integer>>();

                        // nothing to count on empty lines and document markers
                        if (line.isEmpty() || line.startsWith("<doc") || line.startsWith("</doc")) {
                            return listValues;
                        }

                        for (String w : pattern.split(line)) {
                            if (w.isEmpty()) {
                                continue;
                            }
                            listValues.add(new scala.Tuple2<String, Integer>(w.toLowerCase(), 1));
                        }
                        return listValues;
                    }
                }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer val1, Integer val2) throws Exception {
                        return val1 + val2;
                    }
                }).collect();
    } finally {
        // Release the Spark context even if the job fails; the collected
        // results live on the driver and remain usable afterwards.
        sc.stop();
    }

    // Write results.
    StringBuilder sb = new StringBuilder();
    for (scala.Tuple2<String, Integer> tuple : res) {
        String k = tuple._1();
        int v = tuple._2();
        sb.append("Word: ").append(k).append(" Occurrences: ").append(v).append("\n");
    }
    writeTextFile(outputDir + "/results.txt", sb.toString());

    long endTime = System.currentTimeMillis();
    System.out.println("Done! Execution time: " + (endTime - startTime) + " milliseconds.");
}