Example usage for org.apache.spark.api.java.function.PairFlatMapFunction

List of usage examples for org.apache.spark.api.java.function.PairFlatMapFunction

Introduction

On this page you can find example usages of org.apache.spark.api.java.function.PairFlatMapFunction.

Prototype

PairFlatMapFunction

Source Link

Usage

From source file:weka.distributed.spark.RandomizedDataSparkJob.java

License:Open Source License

/**
 * Randomly shuffles the input instances and registers the result as the
 * job's training dataset.
 *
 * <p>
 * Phase 1 assigns every row a key via {@code PhaseOneRandomization} and sorts
 * and repartitions the rows by that key. If a class attribute is set and it is
 * not numeric, phase 2 additionally deals the rows out to folds via
 * {@code PhaseTwoStratification} and repartitions by fold number. Every
 * intermediate RDD is persisted, materialized with {@code count()}, and
 * unpersisted as soon as its successor has been materialized.
 *
 * @param input the instances to shuffle
 * @param outputPath where the shuffled splits are written when writing of the
 *          randomized data is enabled
 * @param numFoldSlices the number of partitions used when repartitioning by
 *          key; also passed to both phases
 * @param random the random number generator handed to phase 1
 * @param headerWithSummary header that still carries summary attributes; a
 *          copy with the summary attributes stripped is used for processing
 * @param classIndex the classIndex to use
 * @throws IOException if a problem occurs
 * @throws DistributedWekaException if a problem occurs
 */
protected void performRandomShuffle(JavaRDD<Instance> input, String outputPath, final int numFoldSlices,
        final Random random, final Instances headerWithSummary, int classIndex)
        throws IOException, DistributedWekaException {

    final Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);
    headerNoSummary.setClassIndex(classIndex);

    boolean containsStringOrRelational = headerNoSummary.checkForStringAttributes()
            || headerNoSummary.checkForAttributeType(Attribute.RELATIONAL);

    final PhaseOneRandomization phaseOne = new PhaseOneRandomization(headerNoSummary, getCSVMapTaskOptions(),
            containsStringOrRelational, random, numFoldSlices);

    // Phase 1 - randomly shuffle the data
    logMessage("[Randomly shuffle data] Starting phase 1...");
    JavaPairRDD<Integer, Object> mapResults = input
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, Object>() {

                /** For serialization */
                private static final long serialVersionUID = -5351850875358513817L;

                @Override
                public Iterable<Tuple2<Integer, Object>> call(Iterator<Instance> split)
                        throws IOException, DistributedWekaException {

                    // The buffer is local so that no state survives between calls.
                    List<Tuple2<Integer, Object>> randomizedRows = new ArrayList<Tuple2<Integer, Object>>();
                    while (split.hasNext()) {
                        randomizedRows.add(phaseOne.process(split.next()));
                    }

                    return randomizedRows;
                }
            }).persist(getCachingStrategy().getStorageLevel());

    // Now sort into ascending order of random assignment number
    JavaPairRDD<Integer, Object> sortedByAssignment = mapResults.sortByKey(true)
            .partitionBy(new IntegerKeyPartitioner(numFoldSlices))
            .persist(getCachingStrategy().getStorageLevel());
    sortedByAssignment.count(); // materialize before dropping its parent

    // discard mapResults
    mapResults.unpersist();
    mapResults = null;

    if (headerNoSummary.classIndex() < 0 || headerNoSummary.classAttribute().isNumeric()) {

        // No need for the second phase of dealing classes out to splits
        // if there is no class or a numeric class
        JavaRDD<Instance> finalDataSet = attachHeaderAndPersist(sortedByAssignment, headerNoSummary);

        finalDataSet.count(); // materialize this RDD

        logMessage("[Randomly shuffle data] Unpersisting sorted phase 1 RDD");
        sortedByAssignment.unpersist();
        sortedByAssignment = null;

        m_sortedByFold = finalDataSet;

    } else {
        // phase 2 - deal classes out to splits + oversample minority classes
        final PhaseTwoStratification phaseTwo = new PhaseTwoStratification(headerNoSummary, numFoldSlices,
                false);

        logMessage("[Randomly shuffle data] Starting phase 2 (deal to folds/stratification)...");
        JavaPairRDD<Integer, Object> dealtToFolds = sortedByAssignment.mapPartitionsToPair(
                new PairFlatMapFunction<Iterator<Tuple2<Integer, Object>>, Integer, Object>() {

                    /** For serialization */
                    private static final long serialVersionUID = -5903374381393577497L;

                    @Override
                    public Iterable<Tuple2<Integer, Object>> call(Iterator<Tuple2<Integer, Object>> split) {

                        List<Tuple2<Integer, Object>> dealtRows = new ArrayList<Tuple2<Integer, Object>>();
                        while (split.hasNext()) {
                            Tuple2<Integer, Object> current = split.next();
                            dealtRows.add(phaseTwo.process(current._2));
                        }

                        // Runs once per partition, after all of its rows have been dealt
                        phaseTwo.checkForMinorityClassCases(dealtRows);

                        return dealtRows;
                    }
                }).persist(getCachingStrategy().getStorageLevel());

        logMessage("[Randomly shuffle data] Repartitioning phase 2 RDD according to fold number");
        JavaPairRDD<Integer, Object> tmpSortedByFold = dealtToFolds.sortByKey()
                .partitionBy(new IntegerKeyPartitioner(numFoldSlices))
                .persist(getCachingStrategy().getStorageLevel());

        tmpSortedByFold.count(); // materialize before dropping its ancestors

        sortedByAssignment.unpersist();
        sortedByAssignment = null;
        dealtToFolds.unpersist();
        dealtToFolds = null;

        logMessage("[Randomly shuffle data] Creating and persisting final dataset (RDD<Instance>)...");
        JavaRDD<Instance> finalDataSet = null;

        if (!containsStringOrRelational) {
            finalDataSet = attachHeaderAndPersist(tmpSortedByFold, headerNoSummary);
        } else {
            CSVToInstancePairFlatMapFunction instanceFunction = new CSVToInstancePairFlatMapFunction(
                    headerNoSummary, getCSVMapTaskOptions());

            // , true here because we preserve the partitions from tmpSortedByFold
            finalDataSet = tmpSortedByFold.mapPartitions(instanceFunction, true)
                    .persist(getCachingStrategy().getStorageLevel());
        }

        logMessage("[Randomly shuffle data] forcing materialization of final shuffled data");
        finalDataSet.count();

        logMessage("[Randomly shuffle data] Unpersisting intermediate RDDs");

        tmpSortedByFold.unpersist();
        tmpSortedByFold = null;

        m_sortedByFold = finalDataSet;
        logMessage("[Randomly shuffle data] Finished shuffling/stratifying RDD. Number of partitions: "
                + m_sortedByFold.partitions().size());
    }

    setDataset(TRAINING_DATA, new Dataset<Instance>(m_sortedByFold, headerWithSummary));

    if (m_writeRandomizedDataToOutput) {
        writeRandomizedSplits(outputPath, m_sortedByFold);
    }
}

/**
 * Drops the keys of a keyed RDD whose values are {@code Instance} objects,
 * gives every instance a reference to the supplied header, and persists the
 * result using the job's caching strategy. The partitioning of the source RDD
 * is preserved.
 *
 * @param keyed the keyed rows; every value must be an {@code Instance}
 * @param header the header to attach to each instance
 * @return the persisted (not yet materialized) RDD of instances
 */
private JavaRDD<Instance> attachHeaderAndPersist(JavaPairRDD<Integer, Object> keyed, final Instances header) {
    // , true here because we preserve the partitions from the keyed RDD
    return keyed.mapPartitions(new FlatMapFunction<Iterator<Tuple2<Integer, Object>>, Instance>() {

        /** For serialization */
        private static final long serialVersionUID = -4129157509045217459L;

        @Override
        public Iterable<Instance> call(Iterator<Tuple2<Integer, Object>> split) {
            List<Instance> instances = new ArrayList<Instance>();
            while (split.hasNext()) {
                // make sure each instance has a reference to the header
                Instance nextI = (Instance) split.next()._2();
                nextI.setDataset(header);
                instances.add(nextI);
            }

            return instances;
        }
    }, true).persist(getCachingStrategy().getStorageLevel());
}

From source file:weka.distributed.spark.WekaAttributeSelectionSparkJob.java

License:Open Source License

/**
 * Builds, for every attribute subset, one classifier per fold.
 *
 * <p>
 * Each iteration runs a map step that trains one {@code WekaClassifierMapTask}
 * per fold on the instances grouped under a subset, followed by a reduce step
 * that aggregates all partial classifiers sharing the same (subset, fold) key.
 * The aggregated classifiers of one iteration are passed to the map tasks of
 * the next iteration.
 *
 * @param dataset the instances, grouped by the attribute subset they are to be
 *          evaluated with
 * @param subsetList the attribute subsets to build classifiers for
 * @param headerNoSummary the header (without summary attributes); supplies the
 *          class index and is used to set up each map task
 * @return a map from attribute subset to its per-fold classifiers, indexed by
 *         zero-based fold
 * @throws Exception if a problem occurs
 */
protected Map<BitSet, Classifier[]> phaseOneBuildClassifiers(JavaPairRDD<BitSet, Iterable<Instance>> dataset,
        BitSet[] subsetList, final Instances headerNoSummary) throws Exception {

    final String classifierMapTaskOptions = environmentSubstitute(
            m_classifierJob.getClassifierMapTaskOptions());
    String[] cOpts = Utils.splitOptions(classifierMapTaskOptions);
    // Each lookup works on its own copy of the option array
    String numFolds = Utils.getOption("total-folds", cOpts.clone());
    final boolean forceVote = Utils.getFlag("force-vote", cOpts.clone());

    int totalFolds = 1;
    if (!DistributedJobConfig.isEmpty(numFolds)) {
        totalFolds = Integer.parseInt(numFolds);
    }
    final int tFolds = totalFolds;

    final Map<BitSet, Classifier[]> foldClassifiers = new HashMap<BitSet, Classifier[]>();
    for (BitSet subset : subsetList) {
        foldClassifiers.put(subset, new Classifier[totalFolds]);
    }

    // just use headerNoSummary for class index
    final int classIndex = headerNoSummary.classIndex();
    final int numPartitions = dataset.partitions().size();

    int numIterations = m_classifierJob.getNumIterations();

    for (int i = 0; i < numIterations; i++) {
        final int iterationNum = i;
        logMessage("[WekaClassifierEvaluation] Phase 1 (map), iteration " + (i + 1));

        JavaPairRDD<Tuple2<BitSet, Integer>, Classifier> mapFolds = dataset.flatMapToPair(
                new PairFlatMapFunction<Tuple2<BitSet, Iterable<Instance>>, Tuple2<BitSet, Integer>, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = -1906414304952140395L;

                    @Override
                    public Iterable<Tuple2<Tuple2<BitSet, Integer>, Classifier>> call(
                            Tuple2<BitSet, Iterable<Instance>> arg0)
                            throws IOException, DistributedWekaException {

                        PreconstructedFilter preconstructedFilter = GetFilterFromBitSet(arg0._1(),
                                headerNoSummary);
                        Iterator<Instance> split = arg0._2().iterator();

                        // Check before calling next(): an empty iterator would otherwise
                        // fail with NoSuchElementException instead of the intended error.
                        if (!split.hasNext()) {
                            throw new IOException("No data in this partition!!");
                        }
                        Instance current = split.next();
                        if (current == null) {
                            throw new IOException("No data in this partition!!");
                        }

                        // Sets the class index on the header the instance refers to
                        Instances header = current.dataset();
                        header.setClassIndex(classIndex);

                        WekaClassifierMapTask[] tasks = new WekaClassifierMapTask[tFolds];
                        for (int j = 0; j < tFolds; j++) {
                            try {
                                tasks[j] = new WekaClassifierMapTask();
                                WekaClassifierSparkJob.configureClassifierMapTask(tasks[j],
                                        foldClassifiers.get(arg0._1())[j], classifierMapTaskOptions,
                                        iterationNum, preconstructedFilter, numPartitions);

                                // set fold number and total folds
                                tasks[j].setFoldNumber(j + 1);
                                tasks[j].setTotalNumFolds(tFolds);
                                Environment env = new Environment();
                                env.addVariable(WekaClassifierMapTask.TOTAL_NUMBER_OF_MAPS, "" + numPartitions);
                                tasks[j].setEnvironment(env);
                            } catch (Exception ex) {
                                logMessage(ex);
                                throw new DistributedWekaException(ex);
                            }

                            // initialize
                            tasks[j].setup(headerNoSummary);
                        }

                        // NOTE(review): the first instance (read above to obtain the
                        // header) is never passed to processInstance(). Kept as-is
                        // because changing it may shift fold membership relative to
                        // the evaluation phase - confirm whether this is intended.
                        while (split.hasNext()) {
                            current = split.next();

                            for (int j = 0; j < tFolds; j++) {
                                tasks[j].processInstance(current);
                            }
                        }

                        // This function is invoked once per record, so the result list
                        // must be local: a field would carry the classifiers of earlier
                        // records into the output of later ones.
                        List<Tuple2<Tuple2<BitSet, Integer>, Classifier>> classifiersForFolds = new ArrayList<Tuple2<Tuple2<BitSet, Integer>, Classifier>>(
                                tFolds);
                        for (int j = 0; j < tFolds; j++) {
                            tasks[j].finalizeTask();
                            classifiersForFolds.add(new Tuple2<Tuple2<BitSet, Integer>, Classifier>(
                                    new Tuple2<BitSet, Integer>(arg0._1(), j), tasks[j].getClassifier()));
                        }

                        return classifiersForFolds;
                    }

                });
        // memory and disk here for fast access and to avoid
        // recomputing partial classifiers if all partial classifiers
        // can't fit in memory
        mapFolds = mapFolds.persist(StorageLevel.MEMORY_AND_DISK());

        // reduce fold models
        logMessage("[WekaClassifierEvaluation] Phase 1 (reduce), iteration " + (i + 1));
        JavaPairRDD<Tuple2<BitSet, Integer>, Classifier> reducedByFold = mapFolds.groupByKey().mapToPair(
                new PairFunction<Tuple2<Tuple2<BitSet, Integer>, Iterable<Classifier>>, Tuple2<BitSet, Integer>, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = 2481672301097842496L;

                    @Override
                    public Tuple2<Tuple2<BitSet, Integer>, Classifier> call(
                            Tuple2<Tuple2<BitSet, Integer>, Iterable<Classifier>> arg0) throws Exception {

                        // Gather all partial classifiers for this (subset, fold) key
                        List<Classifier> classifiers = new ArrayList<Classifier>();
                        for (Classifier partial : arg0._2()) {
                            classifiers.add(partial);
                        }

                        WekaClassifierReduceTask reduceTask = new WekaClassifierReduceTask();
                        Classifier intermediateClassifier = reduceTask.aggregate(classifiers, null, forceVote);

                        return new Tuple2<Tuple2<BitSet, Integer>, Classifier>(arg0._1(),
                                intermediateClassifier);
                    }

                });

        // Store each aggregated classifier under its subset, at its fold index
        List<Tuple2<Tuple2<BitSet, Integer>, Classifier>> aggregated = reducedByFold.collect();
        for (Tuple2<Tuple2<BitSet, Integer>, Classifier> t : aggregated) {
            Tuple2<BitSet, Integer> subsetAndFold = t._1();
            foldClassifiers.get(subsetAndFold._1())[subsetAndFold._2()] = t._2();
        }

        mapFolds.unpersist();
        reducedByFold.unpersist();
    }

    return foldClassifiers;
}

From source file:weka.distributed.spark.WekaAttributeSelectionSparkJob.java

License:Open Source License

/**
 * Evaluates every candidate attribute subset and returns the best one.
 *
 * <p>
 * Each instance is paired with every subset, the pairs are grouped by subset,
 * classifiers are built (phase one) and then evaluated (phase two). A subset's
 * score is the negated error rate when the class attribute is nominal, and the
 * negated mean absolute error otherwise, so that a larger score is better.
 *
 * @param subsetList the candidate attribute subsets
 * @param dataSet the instances to evaluate the subsets on
 * @param headerNoSummary the header without summary attributes
 * @param headerWithSummary the header with summary attributes
 * @return the best score together with the subset that achieved it
 * @throws DistributedWekaException if the evaluation phase produced no results
 * @throws Exception if a problem occurs
 */
protected Tuple2<Double, BitSet> EvaluateSubset(BitSet[] subsetList, JavaRDD<Instance> dataSet,
        Instances headerNoSummary, Instances headerWithSummary) throws Exception {

    logMessage("Evaluate Subsets: ");
    for (BitSet n : subsetList) {
        logMessage(n.toString());
    }

    final BitSet[] finalSubsets = subsetList;

    // Emit one (subset, instance) pair per subset for every instance
    JavaPairRDD<BitSet, Instance> bitsetInstanceData = dataSet
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, BitSet, Instance>() {

                /** For serialization */
                private static final long serialVersionUID = -7672702819274482635L;

                @Override
                public Iterable<Tuple2<BitSet, Instance>> call(Iterator<Instance> split) throws Exception {

                    List<Tuple2<BitSet, Instance>> returnValue = new ArrayList<Tuple2<BitSet, Instance>>();

                    while (split.hasNext()) {
                        Instance current = split.next();

                        for (BitSet s : finalSubsets) {
                            returnValue.add(new Tuple2<BitSet, Instance>(s, current));
                        }
                    }

                    return returnValue;
                }

            }, true);

    JavaPairRDD<BitSet, Iterable<Instance>> groupedData = bitsetInstanceData.groupByKey();

    Map<BitSet, Classifier[]> foldClassifiers = phaseOneBuildClassifiers(groupedData, subsetList,
            headerNoSummary);

    logMessage("Phase 1 done");
    for (Map.Entry<BitSet, Classifier[]> entry : foldClassifiers.entrySet()) {
        logMessage("Bitset: " + entry.getKey().toString() + " Classifiers: " + entry.getValue().length);
    }

    List<Tuple2<BitSet, Evaluation>> results = phaseTwoEvaluateClassifiers(groupedData, headerWithSummary,
            headerNoSummary, foldClassifiers);

    // get best result
    double bestEval = Double.NEGATIVE_INFINITY;
    BitSet bestSubset = null;

    logMessage("Result count = " + results.size());

    for (Tuple2<BitSet, Evaluation> result : results) {
        // Errors are negated so that "larger is better" holds for both class types
        double eval;
        if (headerNoSummary.classAttribute().isNominal()) {
            eval = -result._2().errorRate();
        } else {
            eval = -result._2().meanAbsoluteError();
        }

        // An undefined score is ranked below any realistic one, but still
        // above the initial NEGATIVE_INFINITY so the subset can be chosen
        if (Double.isNaN(eval)) {
            eval = -100000000;
        }

        logMessage("Result = " + result._1().toString() + " ==> " + eval);

        if (eval > bestEval) {
            bestEval = eval;
            bestSubset = result._1();
        }
    }

    // Without any result there is no best subset; fail with a clear message
    // instead of a NullPointerException on the log statement below.
    if (bestSubset == null) {
        throw new DistributedWekaException("No evaluation results were produced for the supplied subsets");
    }

    logMessage("Best Result = " + bestSubset.toString() + " ==> " + bestEval);

    return new Tuple2<Double, BitSet>(bestEval, bestSubset);
}

From source file:weka.distributed.spark.WekaClassifierEvaluationSparkJob.java

License:Open Source License

/**
 * Builds one classifier per fold.
 *
 * <p>
 * Each iteration runs a map step that trains one {@code WekaClassifierMapTask}
 * per fold on every partition of the data, repartitions the partial
 * classifiers by fold number, and then aggregates the partial classifiers of
 * each fold. The aggregated classifiers of one iteration are passed to the map
 * tasks of the next iteration.
 *
 * @param dataset the training instances
 * @param headerNoSummary the header (without summary attributes); supplies the
 *          class index and is used to set up each map task
 * @param preconstructedFilter the filter handed to every map task (may be
 *          whatever {@code configureClassifierMapTask} accepts)
 * @return the classifiers, indexed by zero-based fold
 * @throws Exception if a problem occurs
 */
protected Classifier[] phaseOneBuildClassifiers(JavaRDD<Instance> dataset, final Instances headerNoSummary,
        final PreconstructedFilter preconstructedFilter) throws Exception {

    final String classifierMapTaskOptions = environmentSubstitute(
            m_classifierJob.getClassifierMapTaskOptions());
    String[] cOpts = Utils.splitOptions(classifierMapTaskOptions);
    // Each lookup works on its own copy of the option array
    String numFolds = Utils.getOption("total-folds", cOpts.clone());
    final boolean forceVote = Utils.getFlag("force-vote", cOpts.clone());

    int totalFolds = 1;
    if (!DistributedJobConfig.isEmpty(numFolds)) {
        totalFolds = Integer.parseInt(numFolds);
    }
    final int tFolds = totalFolds;

    final Classifier[] foldClassifiers = new Classifier[totalFolds];

    // just use headerNoSummary for class index
    final int classIndex = headerNoSummary.classIndex();
    final int numPartitions = dataset.partitions().size();

    int numIterations = m_classifierJob.getNumIterations();

    for (int i = 0; i < numIterations; i++) {
        final int iterationNum = i;
        logMessage("[WekaClassifierEvaluation] Phase 1 (map), iteration " + (i + 1));

        JavaPairRDD<Integer, Classifier> mapFolds = dataset
                .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = -1906414304952140395L;

                    @Override
                    public Iterable<Tuple2<Integer, Classifier>> call(Iterator<Instance> split)
                            throws IOException, DistributedWekaException {

                        // Check before calling next(): an empty partition would otherwise
                        // fail with NoSuchElementException instead of the intended error.
                        if (!split.hasNext()) {
                            throw new IOException("No data in this partition!!");
                        }
                        Instance current = split.next();
                        if (current == null) {
                            throw new IOException("No data in this partition!!");
                        }

                        // Sets the class index on the header the instance refers to
                        Instances header = current.dataset();
                        header.setClassIndex(classIndex);

                        WekaClassifierMapTask[] tasks = new WekaClassifierMapTask[tFolds];
                        for (int j = 0; j < tFolds; j++) {
                            try {
                                tasks[j] = new WekaClassifierMapTask();
                                WekaClassifierSparkJob.configureClassifierMapTask(tasks[j], foldClassifiers[j],
                                        classifierMapTaskOptions, iterationNum, preconstructedFilter,
                                        numPartitions);

                                // set fold number and total folds
                                tasks[j].setFoldNumber(j + 1);
                                tasks[j].setTotalNumFolds(tFolds);
                                Environment env = new Environment();
                                env.addVariable(WekaClassifierMapTask.TOTAL_NUMBER_OF_MAPS, "" + numPartitions);
                                tasks[j].setEnvironment(env);
                            } catch (Exception ex) {
                                logMessage(ex);
                                throw new DistributedWekaException(ex);
                            }

                            // initialize
                            tasks[j].setup(headerNoSummary);
                        }

                        // NOTE(review): the first instance (read above to obtain the
                        // header) is never passed to processInstance(). Kept as-is
                        // because changing it may shift fold membership relative to
                        // the evaluation phase - confirm whether this is intended.
                        while (split.hasNext()) {
                            current = split.next();

                            for (int j = 0; j < tFolds; j++) {
                                tasks[j].processInstance(current);
                            }
                        }

                        // Local result buffer: no state is kept on the function object
                        List<Tuple2<Integer, Classifier>> classifiersForFolds = new ArrayList<Tuple2<Integer, Classifier>>(
                                tFolds);
                        for (int j = 0; j < tFolds; j++) {
                            tasks[j].finalizeTask();
                            classifiersForFolds.add(new Tuple2<Integer, Classifier>(j, tasks[j].getClassifier()));
                        }

                        return classifiersForFolds;
                    }
                });

        // memory and disk here for fast access and to avoid
        // recomputing partial classifiers if all partial classifiers
        // can't fit in memory
        mapFolds = mapFolds.persist(StorageLevel.MEMORY_AND_DISK());
        JavaPairRDD<Integer, Classifier> mapFoldsSorted = mapFolds.sortByKey();
        mapFoldsSorted = mapFoldsSorted.partitionBy(new IntegerKeyPartitioner(totalFolds))
                .persist(StorageLevel.MEMORY_AND_DISK());

        // reduce fold models
        logMessage("[WekaClassifierEvaluation] Phase 1 (reduce), iteration " + (i + 1));
        JavaPairRDD<Integer, Classifier> reducedByFold = mapFoldsSorted.mapPartitionsToPair(
                new PairFlatMapFunction<Iterator<Tuple2<Integer, Classifier>>, Integer, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = 2481672301097842496L;

                    @Override
                    public Iterable<Tuple2<Integer, Classifier>> call(
                            Iterator<Tuple2<Integer, Classifier>> split) throws DistributedWekaException {

                        int foldNum = -1;

                        // A partition is expected to hold the partial classifiers of
                        // exactly one fold; anything else is an error.
                        List<Classifier> classifiers = new ArrayList<Classifier>();
                        while (split.hasNext()) {
                            Tuple2<Integer, Classifier> partial = split.next();
                            if (foldNum < 0) {
                                foldNum = partial._1().intValue();
                            } else if (partial._1().intValue() != foldNum) {
                                throw new DistributedWekaException("[WekaClassifierEvaluation] build "
                                        + "classifiers reduce phase: was not expecting fold number "
                                        + "to change within a partition!");
                            }
                            classifiers.add(partial._2());
                        }

                        List<Tuple2<Integer, Classifier>> reducedForFold = new ArrayList<Tuple2<Integer, Classifier>>(1);

                        // An empty partition has no fold number; emitting key -1 would
                        // make the driver index foldClassifiers[-1], so emit nothing.
                        if (classifiers.isEmpty()) {
                            return reducedForFold;
                        }

                        WekaClassifierReduceTask reduceTask = new WekaClassifierReduceTask();
                        Classifier intermediateClassifier = reduceTask.aggregate(classifiers, null, forceVote);

                        reducedForFold.add(new Tuple2<Integer, Classifier>(foldNum, intermediateClassifier));

                        return reducedForFold;
                    }
                });

        // Store each aggregated classifier at its fold index
        List<Tuple2<Integer, Classifier>> aggregated = reducedByFold.collect();
        for (Tuple2<Integer, Classifier> t : aggregated) {
            foldClassifiers[t._1()] = t._2();
        }

        mapFolds.unpersist();
        mapFoldsSorted.unpersist();
        reducedByFold.unpersist();
    }

    return foldClassifiers;
}

From source file:wordcount.Spark_WordCountEachLine.java

License:Apache License

/**
 * Counts the occurrences of every word in the files of a directory using a
 * local Spark context and writes the counts to {@code results.txt} in the
 * output directory.
 *
 * <p>
 * Empty lines and lines starting with {@code <doc} or {@code </doc} are
 * skipped. Words are lower-cased before counting.
 *
 * @param args {@code <inputDir> <outputDir> <numCores>}
 * @throws IllegalArgumentException if the number of arguments is not three
 */
public static void main(String[] args) {

    if (args.length != 3) {
        throw new IllegalArgumentException(
                "Usage: " + Spark_WordCountEachLine.class.getName() + " <inputDir> <outputDir> <numCores>");
    }

    long startTime = System.currentTimeMillis();

    String inputDir = args[0];
    String outputDir = args[1];
    int numCores = Integer.parseInt(args[2]);

    SparkConf conf = new SparkConf().setAppName("Spark word count");
    conf.setMaster("local[" + numCores + "]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    final List<scala.Tuple2<String, Integer>> res;
    try {
        JavaRDD<String> textLines = sc.textFile(inputDir + "/*");
        res = textLines.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
            @Override
            public Iterable<scala.Tuple2<String, Integer>> call(String line) throws Exception {
                ArrayList<scala.Tuple2<String, Integer>> listValues = new ArrayList<scala.Tuple2<String, Integer>>();

                // Blank lines and document markers contribute no words
                if (line.isEmpty() || line.startsWith("<doc") || line.startsWith("</doc")) {
                    return listValues;
                }

                for (String w : pattern.split(line)) {
                    if (w.isEmpty()) {
                        continue;
                    }
                    // Locale.ROOT keeps the casing independent of the JVM's default locale
                    listValues.add(new scala.Tuple2<String, Integer>(w.toLowerCase(java.util.Locale.ROOT), 1));
                }
                return listValues;
            }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer val1, Integer val2) throws Exception {
                return val1 + val2;
            }
        }).collect();
    } finally {
        // Always release the Spark context, even when the job fails
        sc.stop();
    }

    // Write results.
    StringBuilder sb = new StringBuilder();
    for (scala.Tuple2<String, Integer> tuple : res) {
        String k = tuple._1();
        int v = tuple._2();
        sb.append("Word: ").append(k).append(" Occurrences: ").append(v).append("\n");
    }
    writeTextFile(outputDir + "/results.txt", sb.toString());

    long endTime = System.currentTimeMillis();
    System.out.println("Done! Execution time: " + (endTime - startTime) + " milliseconds.");
}