List of usage examples for org.apache.spark.api.java.function.PairFlatMapFunction
PairFlatMapFunction
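PairFlatMapFunction<T, K, V> is the function interface the Spark Java API uses for flatMap-style operations that emit zero or more key/value pairs per input element; it is the argument type of JavaRDD.flatMapToPair and (with an Iterator input type) of mapPartitionsToPair. All examples on this page use the Spark 1.x form of the interface, in which call returns an Iterable<Tuple2<K, V>>. As a minimal orientation sketch (class and method names here are illustrative, not taken from the examples below):

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class PairFlatMapFunctionSketch {
    public static JavaPairRDD<String, Integer> wordPairs(JavaSparkContext sc, JavaRDD<String> lines) {
        // Emit a (word, 1) pair for every whitespace-separated token in each line
        return lines.flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
            @Override
            public Iterable<Tuple2<String, Integer>> call(String line) {
                List<Tuple2<String, Integer>> pairs = new ArrayList<Tuple2<String, Integer>>();
                for (String w : line.split("\\s+")) {
                    if (!w.isEmpty()) {
                        pairs.add(new Tuple2<String, Integer>(w, 1));
                    }
                }
                return pairs;
            }
        });
    }
}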
From source file:weka.distributed.spark.RandomizedDataSparkJob.java
License:Open Source License
/**
 * Perform the randomization (and stratification) in the case where the input
 * data does not contain string or relational attributes. In this case, our
 * final RDD can contain Instance objects, which will avoid further parsing
 * in subsequent jobs.
 *
 * @param input the input RDD of instances
 * @param outputPath the path to write output to
 * @param numFoldSlices the number of fold slices (partitions) to use
 * @param random the random number generator to use
 * @param headerWithSummary the ARFF header (including summary attributes)
 * @param classIndex the classIndex to use
 * @throws IOException if an I/O problem occurs
 * @throws DistributedWekaException if a problem occurs
 */
protected void performRandomShuffle(JavaRDD<Instance> input, String outputPath, final int numFoldSlices,
        final Random random, final Instances headerWithSummary, int classIndex)
        throws IOException, DistributedWekaException {
    final Instances headerNoSummary = CSVToARFFHeaderReduceTask.stripSummaryAtts(headerWithSummary);
    headerNoSummary.setClassIndex(classIndex);
    boolean containsStringOrRelational = headerNoSummary.checkForStringAttributes()
            || headerNoSummary.checkForAttributeType(Attribute.RELATIONAL);
    final PhaseOneRandomization phaseOne = new PhaseOneRandomization(headerNoSummary, getCSVMapTaskOptions(),
            containsStringOrRelational, random, numFoldSlices);

    // Phase 1 - randomly shuffle the data
    logMessage("[Randomly shuffle data] Starting phase 1...");
    JavaPairRDD<Integer, Object> mapResults = input
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, Object>() {

                /** For serialization */
                private static final long serialVersionUID = -5351850875358513817L;

                protected List<Tuple2<Integer, Object>> m_randomizedRows = new ArrayList<Tuple2<Integer, Object>>();

                @Override
                public Iterable<Tuple2<Integer, Object>> call(Iterator<Instance> split)
                        throws IOException, DistributedWekaException {
                    while (split.hasNext()) {
                        Instance row = split.next();
                        Tuple2<Integer, Object> processed = phaseOne.process(row);
                        m_randomizedRows.add(processed);
                    }
                    return m_randomizedRows;
                }
            }).persist(getCachingStrategy().getStorageLevel());

    // Now sort into ascending order of random assignment number
    JavaPairRDD<Integer, Object> sortedByAssignment = mapResults.sortByKey(true)
            .partitionBy(new IntegerKeyPartitioner(numFoldSlices))
            .persist(getCachingStrategy().getStorageLevel());
    sortedByAssignment.count();

    // discard mapResults
    mapResults.unpersist();
    mapResults = null;

    if (headerNoSummary.classIndex() < 0 || headerNoSummary.classAttribute().isNumeric()) {
        // No need for the second phase of dealing classes out to splits
        // if there is no class or a numeric class.
        // true here because we preserve the partitions from sortedByAssignment
        JavaRDD<Instance> finalDataSet = sortedByAssignment
                .mapPartitions(new FlatMapFunction<Iterator<Tuple2<Integer, Object>>, Instance>() {

                    /** For serialization */
                    private static final long serialVersionUID = -4129157509045217459L;

                    List<Instance> m_list = new ArrayList<Instance>();

                    @Override
                    public Iterable<Instance> call(Iterator<Tuple2<Integer, Object>> split) {
                        while (split.hasNext()) {
                            // make sure each instance has a reference to the header
                            Instance nextI = (Instance) split.next()._2();
                            nextI.setDataset(headerNoSummary);
                            m_list.add(nextI);
                        }
                        return m_list;
                    }
                }, true).persist(getCachingStrategy().getStorageLevel());

        finalDataSet.count(); // materialize this RDD
        logMessage("[Randomly shuffle data] Unpersisting sorted phase 1 RDD");
        sortedByAssignment.unpersist();
        sortedByAssignment = null;
        m_sortedByFold = finalDataSet;
    } else {
        // phase 2 - deal classes out to splits + oversample minority classes
        final PhaseTwoStratification phaseTwo = new PhaseTwoStratification(headerNoSummary, numFoldSlices, false);

        logMessage("[Randomly shuffle data] Starting phase 2 (deal to folds/stratification)...");
        JavaPairRDD<Integer, Object> dealtToFolds = sortedByAssignment.mapPartitionsToPair(
                new PairFlatMapFunction<Iterator<Tuple2<Integer, Object>>, Integer, Object>() {

                    /** For serialization */
                    private static final long serialVersionUID = -5903374381393577497L;

                    protected List<Tuple2<Integer, Object>> m_dealtRows = new ArrayList<Tuple2<Integer, Object>>();

                    @Override
                    public Iterable<Tuple2<Integer, Object>> call(Iterator<Tuple2<Integer, Object>> split) {
                        while (split.hasNext()) {
                            Tuple2<Integer, Object> current = split.next();
                            Tuple2<Integer, Object> result = phaseTwo.process(current._2);
                            m_dealtRows.add(result);
                        }
                        phaseTwo.checkForMinorityClassCases(m_dealtRows);
                        return m_dealtRows;
                    }
                }).persist(getCachingStrategy().getStorageLevel());

        logMessage("[Randomly shuffle data] Repartitioning phase 2 RDD according to fold number");
        JavaPairRDD<Integer, Object> tmpSortedByFold = dealtToFolds.sortByKey()
                .partitionBy(new IntegerKeyPartitioner(numFoldSlices))
                .persist(getCachingStrategy().getStorageLevel());
        tmpSortedByFold.count();

        // discard the intermediate RDDs
        sortedByAssignment.unpersist();
        sortedByAssignment = null;
        dealtToFolds.unpersist();
        dealtToFolds = null;

        logMessage("[Randomly shuffle data] Creating and persisting final dataset (RDD<Instance>)...");
        JavaRDD<Instance> finalDataSet = null;
        if (!containsStringOrRelational) {
            finalDataSet = tmpSortedByFold
                    .mapPartitions(new FlatMapFunction<Iterator<Tuple2<Integer, Object>>, Instance>() {

                        /** For serialization */
                        private static final long serialVersionUID = 5425826829981136102L;

                        List<Instance> m_list = new ArrayList<Instance>();

                        @Override
                        public Iterable<Instance> call(Iterator<Tuple2<Integer, Object>> split) {
                            while (split.hasNext()) {
                                // make sure that each instance has access to the header
                                Instance nextI = (Instance) split.next()._2();
                                nextI.setDataset(headerNoSummary);
                                m_list.add(nextI);
                            }
                            return m_list;
                        }
                    }, true).persist(getCachingStrategy().getStorageLevel());
        } else {
            CSVToInstancePairFlatMapFunction instanceFunction = new CSVToInstancePairFlatMapFunction(
                    headerNoSummary, getCSVMapTaskOptions());
            // true here because we preserve the partitions from tmpSortedByFold
            finalDataSet = tmpSortedByFold.mapPartitions(instanceFunction, true)
                    .persist(getCachingStrategy().getStorageLevel());
        }

        logMessage("[Randomly shuffle data] Forcing materialization of final shuffled data");
        finalDataSet.count();
        logMessage("[Randomly shuffle data] Unpersisting intermediate RDDs");
        tmpSortedByFold.unpersist();
        tmpSortedByFold = null;
        m_sortedByFold = finalDataSet;

        logMessage("[Randomly shuffle data] Finished shuffling/stratifying RDD. Number of partitions: "
                + m_sortedByFold.partitions().size());
    }

    setDataset(TRAINING_DATA, new Dataset<Instance>(m_sortedByFold, headerWithSummary));

    if (m_writeRandomizedDataToOutput) {
        writeRandomizedSplits(outputPath, m_sortedByFold);
    }
}
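The shuffle idiom at the heart of the example above, key each row with a bounded random integer via mapPartitionsToPair, then sortByKey and partitionBy to spread rows across a fixed number of partitions, can be distilled as follows. This is a sketch under simplifying assumptions: rows are plain strings, and Spark's HashPartitioner stands in for the job's IntegerKeyPartitioner.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Random;

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class RandomShuffleSketch {
    // Key each row with a bounded random integer, sort by that key, then
    // repartition so rows land in numSlices roughly equal groups.
    public static JavaPairRDD<Integer, String> shuffle(JavaRDD<String> rows, final int numSlices) {
        JavaPairRDD<Integer, String> keyed = rows
                .mapPartitionsToPair(new PairFlatMapFunction<Iterator<String>, Integer, String>() {
                    @Override
                    public Iterable<Tuple2<Integer, String>> call(Iterator<String> split) {
                        Random r = new Random();
                        List<Tuple2<Integer, String>> out = new ArrayList<Tuple2<Integer, String>>();
                        while (split.hasNext()) {
                            out.add(new Tuple2<Integer, String>(r.nextInt(numSlices), split.next()));
                        }
                        return out;
                    }
                });
        // HashPartitioner stands in here for the job's own IntegerKeyPartitioner
        return keyed.sortByKey(true).partitionBy(new HashPartitioner(numSlices));
    }
}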
From source file:weka.distributed.spark.WekaAttributeSelectionSparkJob.java
License:Open Source License
protected Map<BitSet, Classifier[]> phaseOneBuildClassifiers(JavaPairRDD<BitSet, Iterable<Instance>> dataset,
        BitSet[] subsetList, final Instances headerNoSummary) throws Exception {

    int totalFolds = 1;
    final String classifierMapTaskOptions = environmentSubstitute(
            m_classifierJob.getClassifierMapTaskOptions());
    String[] cOpts = Utils.splitOptions(classifierMapTaskOptions);
    String numFolds = Utils.getOption("total-folds", cOpts.clone());
    final boolean forceVote = Utils.getFlag("force-vote", cOpts.clone());
    if (!DistributedJobConfig.isEmpty(numFolds)) {
        totalFolds = Integer.parseInt(numFolds);
    }

    final int tFolds = totalFolds;
    final Map<BitSet, Classifier[]> foldClassifiers = new HashMap<BitSet, Classifier[]>();
    for (BitSet subset : subsetList) {
        foldClassifiers.put(subset, new Classifier[totalFolds]);
    }

    // just use headerNoSummary for class index
    final int classIndex = headerNoSummary.classIndex();
    final int numPartitions = dataset.partitions().size();
    int numIterations = m_classifierJob.getNumIterations();
    final int numSplits = dataset.partitions().size();

    for (int i = 0; i < numIterations; i++) {
        final int iterationNum = i;
        logMessage("[WekaClassifierEvaluation] Phase 1 (map), iteration " + (i + 1));

        JavaPairRDD<Tuple2<BitSet, Integer>, Classifier> mapFolds = dataset.flatMapToPair(
                new PairFlatMapFunction<Tuple2<BitSet, Iterable<Instance>>, Tuple2<BitSet, Integer>, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = -1906414304952140395L;

                    protected Instances m_header;

                    /** Holds results */
                    protected List<Tuple2<Tuple2<BitSet, Integer>, Classifier>> m_classifiersForFolds =
                            new ArrayList<Tuple2<Tuple2<BitSet, Integer>, Classifier>>();

                    @Override
                    public Iterable<Tuple2<Tuple2<BitSet, Integer>, Classifier>> call(
                            Tuple2<BitSet, Iterable<Instance>> arg0) throws IOException, DistributedWekaException {
                        PreconstructedFilter preconstructedFilter = GetFilterFromBitSet(arg0._1(),
                                headerNoSummary);

                        Iterator<Instance> split = arg0._2().iterator();
                        Instance current = split.next();
                        if (current == null) {
                            throw new IOException("No data in this partition!!");
                        }
                        m_header = current.dataset();
                        m_header.setClassIndex(classIndex);

                        WekaClassifierMapTask[] tasks = new WekaClassifierMapTask[tFolds];
                        for (int j = 0; j < tFolds; j++) {
                            try {
                                tasks[j] = new WekaClassifierMapTask();
                                WekaClassifierSparkJob.configureClassifierMapTask(tasks[j],
                                        foldClassifiers.get(arg0._1())[j], classifierMapTaskOptions,
                                        iterationNum, preconstructedFilter, numSplits);

                                // set fold number and total folds
                                tasks[j].setFoldNumber(j + 1);
                                tasks[j].setTotalNumFolds(tFolds);
                                Environment env = new Environment();
                                env.addVariable(WekaClassifierMapTask.TOTAL_NUMBER_OF_MAPS, "" + numPartitions);
                                tasks[j].setEnvironment(env);
                            } catch (Exception ex) {
                                logMessage(ex);
                                throw new DistributedWekaException(ex);
                            }

                            // initialize
                            tasks[j].setup(headerNoSummary);
                        }

                        while (split.hasNext()) {
                            current = split.next();
                            for (int j = 0; j < tFolds; j++) {
                                tasks[j].processInstance(current);
                            }
                        }

                        for (int j = 0; j < tFolds; j++) {
                            tasks[j].finalizeTask();
                            m_classifiersForFolds.add(new Tuple2<Tuple2<BitSet, Integer>, Classifier>(
                                    new Tuple2<BitSet, Integer>(arg0._1(), j), tasks[j].getClassifier()));
                        }

                        return m_classifiersForFolds;
                    }
                });

        // memory and disk here for fast access and to avoid recomputing
        // partial classifiers if all partial classifiers can't fit in memory
        mapFolds = mapFolds.persist(StorageLevel.MEMORY_AND_DISK());

        // reduce fold models
        logMessage("[WekaClassifierEvaluation] Phase 1 (reduce), iteration " + (i + 1));
        JavaPairRDD<Tuple2<BitSet, Integer>, Classifier> reducedByFold = mapFolds.groupByKey().mapToPair(
                new PairFunction<Tuple2<Tuple2<BitSet, Integer>, Iterable<Classifier>>, Tuple2<BitSet, Integer>, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = 2481672301097842496L;

                    @Override
                    public Tuple2<Tuple2<BitSet, Integer>, Classifier> call(
                            Tuple2<Tuple2<BitSet, Integer>, Iterable<Classifier>> arg0) throws Exception {
                        Iterator<Classifier> split = arg0._2().iterator();
                        List<Classifier> classifiers = new ArrayList<Classifier>();
                        while (split.hasNext()) {
                            classifiers.add(split.next());
                        }

                        WekaClassifierReduceTask reduceTask = new WekaClassifierReduceTask();
                        Classifier intermediateClassifier = reduceTask.aggregate(classifiers, null, forceVote);

                        return new Tuple2<Tuple2<BitSet, Integer>, Classifier>(arg0._1(),
                                intermediateClassifier);
                    }
                });

        List<Tuple2<Tuple2<BitSet, Integer>, Classifier>> aggregated = reducedByFold.collect();
        for (Tuple2<Tuple2<BitSet, Integer>, Classifier> t : aggregated) {
            // store the aggregated classifier under its (subset, fold) pair
            foldClassifiers.get(t._1()._1())[t._1()._2()] = t._2();
        }

        mapFolds.unpersist();
        reducedByFold.unpersist();
    }

    return foldClassifiers;
}
From source file:weka.distributed.spark.WekaAttributeSelectionSparkJob.java
License:Open Source License
protected Tuple2<Double, BitSet> EvaluateSubset(BitSet[] subsetList, JavaRDD<Instance> dataSet,
        Instances headerNoSummary, Instances headerWithSummary) throws Exception {

    logMessage("Evaluate Subsets: ");
    for (BitSet n : subsetList) {
        logMessage(n.toString());
    }

    final BitSet[] finalSubsets = subsetList;

    // pair each instance with every candidate attribute subset
    JavaPairRDD<BitSet, Instance> bitsetInstanceData = dataSet
            .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, BitSet, Instance>() {

                /** For serialization */
                private static final long serialVersionUID = -7672702819274482635L;

                @Override
                public Iterable<Tuple2<BitSet, Instance>> call(Iterator<Instance> split) throws Exception {
                    List<Tuple2<BitSet, Instance>> returnValue = new ArrayList<Tuple2<BitSet, Instance>>();
                    while (split.hasNext()) {
                        Instance current = split.next();
                        for (BitSet s : finalSubsets) {
                            returnValue.add(new Tuple2<BitSet, Instance>(s, current));
                        }
                    }
                    return returnValue;
                }
            }, true);

    JavaPairRDD<BitSet, Iterable<Instance>> groupedData = bitsetInstanceData.groupByKey();

    Map<BitSet, Classifier[]> foldClassifiers = phaseOneBuildClassifiers(groupedData, subsetList,
            headerNoSummary);
    logMessage("Phase 1 done");
    for (Map.Entry<BitSet, Classifier[]> entry : foldClassifiers.entrySet()) {
        logMessage("Bitset: " + entry.getKey().toString() + " Classifiers: " + entry.getValue().length);
    }

    List<Tuple2<BitSet, Evaluation>> results = phaseTwoEvaluateClassifiers(groupedData, headerWithSummary,
            headerNoSummary, foldClassifiers);

    // get best result
    Double bestEval = Double.NEGATIVE_INFINITY;
    BitSet bestSubset = null;
    logMessage("Result count = " + results.size());
    for (Tuple2<BitSet, Evaluation> result : results) {
        // negate the error measure so that a higher value is always better
        Double eval = headerNoSummary.classAttribute().isNominal() ? -result._2().errorRate()
                : -result._2().meanAbsoluteError();
        if (eval.isNaN()) {
            eval = (double) -100000000;
        }
        logMessage("Result = " + result._1().toString() + " ==> " + eval);
        if (eval > bestEval) {
            bestEval = eval;
            bestSubset = result._1();
        }
    }
    logMessage("Best Result = " + bestSubset.toString() + " ==> " + bestEval);

    return new Tuple2<Double, BitSet>(bestEval, bestSubset);
}
From source file:weka.distributed.spark.WekaClassifierEvaluationSparkJob.java
License:Open Source License
protected Classifier[] phaseOneBuildClassifiers(JavaRDD<Instance> dataset, final Instances headerNoSummary,
        final PreconstructedFilter preconstructedFilter) throws Exception {

    int totalFolds = 1;
    final String classifierMapTaskOptions = environmentSubstitute(
            m_classifierJob.getClassifierMapTaskOptions());
    String[] cOpts = Utils.splitOptions(classifierMapTaskOptions);
    String numFolds = Utils.getOption("total-folds", cOpts.clone());
    final boolean forceVote = Utils.getFlag("force-vote", cOpts.clone());
    if (!DistributedJobConfig.isEmpty(numFolds)) {
        totalFolds = Integer.parseInt(numFolds);
    }

    final int tFolds = totalFolds;
    final Classifier[] foldClassifiers = new Classifier[totalFolds];

    // just use headerNoSummary for class index
    final int classIndex = headerNoSummary.classIndex();
    final int numPartitions = dataset.partitions().size();
    int numIterations = m_classifierJob.getNumIterations();
    final int numSplits = dataset.partitions().size();

    for (int i = 0; i < numIterations; i++) {
        final int iterationNum = i;
        logMessage("[WekaClassifierEvaluation] Phase 1 (map), iteration " + (i + 1));

        JavaPairRDD<Integer, Classifier> mapFolds = dataset
                .mapPartitionsToPair(new PairFlatMapFunction<Iterator<Instance>, Integer, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = -1906414304952140395L;

                    protected Instances m_header;

                    /** Holds results */
                    protected List<Tuple2<Integer, Classifier>> m_classifiersForFolds =
                            new ArrayList<Tuple2<Integer, Classifier>>();

                    @Override
                    public Iterable<Tuple2<Integer, Classifier>> call(Iterator<Instance> split)
                            throws IOException, DistributedWekaException {
                        Instance current = split.next();
                        if (current == null) {
                            throw new IOException("No data in this partition!!");
                        }
                        m_header = current.dataset();
                        m_header.setClassIndex(classIndex);

                        WekaClassifierMapTask[] tasks = new WekaClassifierMapTask[tFolds];
                        for (int j = 0; j < tFolds; j++) {
                            try {
                                tasks[j] = new WekaClassifierMapTask();
                                WekaClassifierSparkJob.configureClassifierMapTask(tasks[j], foldClassifiers[j],
                                        classifierMapTaskOptions, iterationNum, preconstructedFilter,
                                        numSplits);

                                // set fold number and total folds
                                tasks[j].setFoldNumber(j + 1);
                                tasks[j].setTotalNumFolds(tFolds);
                                Environment env = new Environment();
                                env.addVariable(WekaClassifierMapTask.TOTAL_NUMBER_OF_MAPS, "" + numPartitions);
                                tasks[j].setEnvironment(env);
                            } catch (Exception ex) {
                                logMessage(ex);
                                throw new DistributedWekaException(ex);
                            }

                            // initialize
                            tasks[j].setup(headerNoSummary);
                        }

                        while (split.hasNext()) {
                            current = split.next();
                            for (int j = 0; j < tFolds; j++) {
                                tasks[j].processInstance(current);
                            }
                        }

                        for (int j = 0; j < tFolds; j++) {
                            tasks[j].finalizeTask();
                            m_classifiersForFolds
                                    .add(new Tuple2<Integer, Classifier>(j, tasks[j].getClassifier()));
                        }

                        return m_classifiersForFolds;
                    }
                });

        // memory and disk here for fast access and to avoid recomputing
        // partial classifiers if all partial classifiers can't fit in memory
        mapFolds = mapFolds.persist(StorageLevel.MEMORY_AND_DISK());
        JavaPairRDD<Integer, Classifier> mapFoldsSorted = mapFolds.sortByKey();
        mapFoldsSorted = mapFoldsSorted.partitionBy(new IntegerKeyPartitioner(totalFolds))
                .persist(StorageLevel.MEMORY_AND_DISK());

        // reduce fold models
        logMessage("[WekaClassifierEvaluation] Phase 1 (reduce), iteration " + (i + 1));
        JavaPairRDD<Integer, Classifier> reducedByFold = mapFoldsSorted.mapPartitionsToPair(
                new PairFlatMapFunction<Iterator<Tuple2<Integer, Classifier>>, Integer, Classifier>() {

                    /** For serialization */
                    private static final long serialVersionUID = 2481672301097842496L;

                    /** Holds reduced classifier for one fold (partition) */
                    protected List<Tuple2<Integer, Classifier>> m_reducedForFold =
                            new ArrayList<Tuple2<Integer, Classifier>>();

                    @Override
                    public Iterable<Tuple2<Integer, Classifier>> call(
                            Iterator<Tuple2<Integer, Classifier>> split) throws DistributedWekaException {
                        int foldNum = -1;
                        List<Classifier> classifiers = new ArrayList<Classifier>();
                        while (split.hasNext()) {
                            Tuple2<Integer, Classifier> partial = split.next();
                            if (foldNum < 0) {
                                foldNum = partial._1().intValue();
                            } else if (partial._1().intValue() != foldNum) {
                                throw new DistributedWekaException("[WekaClassifierEvaluation] build "
                                        + "classifiers reduce phase: was not expecting fold number "
                                        + "to change within a partition!");
                            }
                            classifiers.add(partial._2());
                        }

                        WekaClassifierReduceTask reduceTask = new WekaClassifierReduceTask();
                        Classifier intermediateClassifier = reduceTask.aggregate(classifiers, null, forceVote);
                        m_reducedForFold.add(new Tuple2<Integer, Classifier>(foldNum, intermediateClassifier));

                        return m_reducedForFold;
                    }
                });

        List<Tuple2<Integer, Classifier>> aggregated = reducedByFold.collect();
        for (Tuple2<Integer, Classifier> t : aggregated) {
            foldClassifiers[t._1()] = t._2();
        }

        mapFolds.unpersist();
        mapFoldsSorted.unpersist();
        reducedByFold.unpersist();
    }

    return foldClassifiers;
}
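Both phaseOneBuildClassifiers methods above share the same map/reduce shape: every partition trains one partial model per cross-validation fold and emits (fold, model) pairs, and the reduce side merges the partials that share a fold number. Stripped of the Weka-specific task classes, the shape looks roughly like the sketch below; PartialModel and its update/combine methods are hypothetical stand-ins, and reduceByKey is used only to keep the sketch short (the jobs above use groupByKey or a fold-partitioned mapPartitionsToPair so that WekaClassifierReduceTask can aggregate a whole list of classifiers at once).

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

public class FoldAggregationSketch {
    /** Hypothetical stand-in for a partial classifier built on one partition. */
    public static class PartialModel implements java.io.Serializable {
        void update(String row) { /* placeholder training step */ }
        PartialModel combine(PartialModel other) { return this; } // placeholder merge
    }

    public static JavaPairRDD<Integer, PartialModel> buildPerFold(JavaRDD<String> data, final int numFolds) {
        // Map side: every partition trains numFolds partial models and emits (fold, model) pairs
        JavaPairRDD<Integer, PartialModel> partials = data
                .mapPartitionsToPair(new PairFlatMapFunction<Iterator<String>, Integer, PartialModel>() {
                    @Override
                    public Iterable<Tuple2<Integer, PartialModel>> call(Iterator<String> split) {
                        PartialModel[] models = new PartialModel[numFolds];
                        for (int j = 0; j < numFolds; j++) {
                            models[j] = new PartialModel();
                        }
                        while (split.hasNext()) {
                            String row = split.next();
                            for (int j = 0; j < numFolds; j++) {
                                models[j].update(row);
                            }
                        }
                        List<Tuple2<Integer, PartialModel>> out = new ArrayList<Tuple2<Integer, PartialModel>>();
                        for (int j = 0; j < numFolds; j++) {
                            out.add(new Tuple2<Integer, PartialModel>(j, models[j]));
                        }
                        return out;
                    }
                });
        // Reduce side: merge the partial models that share a fold number
        return partials.reduceByKey(new Function2<PartialModel, PartialModel, PartialModel>() {
            @Override
            public PartialModel call(PartialModel a, PartialModel b) {
                return a.combine(b);
            }
        });
    }
}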
From source file:wordcount.Spark_WordCountEachLine.java
License:Apache License
public static void main(String[] args) {
    if (args.length != 3) {
        throw new IllegalArgumentException(
                "Usage: " + Spark_WordCountEachLine.class.getName() + " <inputDir> <outputDir> <numCores>");
    }

    long startTime = System.currentTimeMillis();
    String inputDir = args[0];
    String outputDir = args[1];
    int numCores = Integer.parseInt(args[2]);

    SparkConf conf = new SparkConf().setAppName("Spark word count");
    conf.setMaster("local[" + numCores + "]");
    JavaSparkContext sc = new JavaSparkContext(conf);

    JavaRDD<String> textLines = sc.textFile(inputDir + "/*");
    List<scala.Tuple2<String, Integer>> res = textLines
            .flatMapToPair(new PairFlatMapFunction<String, String, Integer>() {
                @Override
                public Iterable<scala.Tuple2<String, Integer>> call(String line) throws Exception {
                    // skip empty lines and document markup lines
                    if (line.isEmpty() || line.startsWith("<doc") || line.startsWith("</doc")) {
                        return new ArrayList<scala.Tuple2<String, Integer>>();
                    }
                    ArrayList<scala.Tuple2<String, Integer>> listValues =
                            new ArrayList<scala.Tuple2<String, Integer>>();
                    String[] a = pattern.split(line);
                    for (int i = 0; i < a.length; i++) {
                        String w = a[i];
                        if (w.isEmpty()) {
                            continue;
                        }
                        w = w.toLowerCase();
                        listValues.add(new scala.Tuple2<String, Integer>(w, 1));
                    }
                    return listValues;
                }
            }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                @Override
                public Integer call(Integer val1, Integer val2) throws Exception {
                    return val1 + val2;
                }
            }).collect();

    // Write results.
    StringBuilder sb = new StringBuilder();
    for (scala.Tuple2<String, Integer> tuple : res) {
        sb.append("Word: " + tuple._1() + " Occurrences: " + tuple._2() + "\n");
    }
    writeTextFile(outputDir + "/results.txt", sb.toString());

    long endTime = System.currentTimeMillis();
    System.out.println("Done! Execution time: " + (endTime - startTime) + " milliseconds.");
}
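All of the examples on this page compile against the Spark 1.x Java API, where PairFlatMapFunction.call returns Iterable<Tuple2<K, V>>. From Spark 2.0 onward the interface returns Iterator<Tuple2<K, V>> instead, so porting typically means returning list.iterator(). A sketch of the word-tokenizing function above in Spark 2.x form (the whitespace regex here stands in for the original's static pattern field):

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.api.java.function.PairFlatMapFunction;

import scala.Tuple2;

// Spark 2.x variant: PairFlatMapFunction.call returns an Iterator, not an Iterable
public class WordPairs2x implements PairFlatMapFunction<String, String, Integer> {
    @Override
    public Iterator<Tuple2<String, Integer>> call(String line) {
        List<Tuple2<String, Integer>> listValues = new ArrayList<Tuple2<String, Integer>>();
        for (String w : line.split("\\s+")) {
            if (!w.isEmpty()) {
                listValues.add(new Tuple2<String, Integer>(w.toLowerCase(), 1));
            }
        }
        return listValues.iterator();
    }
}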