List of usage examples for org.apache.hadoop.io.BytesWritable.getBytes()
@Override public byte[] getBytes()
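BytesWritable.getBytes() returns the writable's backing buffer, whose capacity can exceed the valid data: only the first getLength() bytes are meaningful, because Hadoop over-allocates and reuses the buffer between records. A minimal standalone sketch (not taken from the source files below) showing how to trim to the valid region:

import java.util.Arrays;

import org.apache.hadoop.io.BytesWritable;

public class GetBytesExample {
  public static void main(String[] args) {
    BytesWritable bw = new BytesWritable(new byte[] { 10, 20, 30, 40 });
    bw.setSize(2); // shrink the valid region; the backing buffer keeps its capacity

    byte[] raw = bw.getBytes(); // the whole backing buffer (may be longer than the data)
    byte[] valid = Arrays.copyOf(raw, bw.getLength()); // just the valid bytes

    // bw.copyBytes() produces the same trimmed copy in a single call
    System.out.println("buffer: " + raw.length + " bytes, valid: " + valid.length + " bytes");
  }
}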
From source file: weka.distributed.hadoop.CSVToArffHeaderHadoopReducer.java
License: Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
  Configuration conf = context.getConfiguration();
  String outputDestination = conf.get(CSV_TO_ARFF_HEADER_WRITE_PATH);

  if (outputDestination == null || outputDestination.length() == 0) {
    throw new IOException("No destination given for aggregated ARFF header");
  }

  List<Instances> headersToAgg = new ArrayList<Instances>();
  List<HeaderAndQuantileDataHolder> holdersToAgg = new ArrayList<HeaderAndQuantileDataHolder>();

  int counter = 0;
  try {
    for (BytesWritable b : values) {
      byte[] bytes = b.getBytes();
      if (m_estimateQuantiles) {
        HeaderAndQuantileDataHolder holder = deserializeHolder(bytes);
        holdersToAgg.add(holder);
      } else {
        Instances aHeader = deserializeHeader(bytes);
        headersToAgg.add(aHeader);
      }
      counter++;
    }
  } catch (Exception ex) {
    throw new IOException(ex);
  }

  try {
    Instances aggregated = m_estimateQuantiles ? m_task.aggregateHeadersAndQuartiles(holdersToAgg)
        : m_task.aggregate(headersToAgg);
    writeHeaderToDestination(aggregated, outputDestination, conf);

    Text outkey = new Text();
    outkey.set("AKey");
    Text outval = new Text();
    outval.set("Num headers aggregated " + counter);
    context.write(outkey, outval);
  } catch (Exception e) {
    throw new IOException(e);
  }
}
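The deserializeHeader() and deserializeHolder() helpers are not shown on this page. As a rough sketch only, assuming plain Java object serialization (the actual Weka implementation may differ, for example by compressing the stream), a comparable helper could look like the following; note that an ObjectInputStream stops reading at the end of the serialized object, so any surplus buffer capacity that getBytes() returns past getLength() is ignored:

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;

public class DeserializeSketch {
  // Hypothetical stand-in for helpers such as deserializeHeader(byte[]):
  // reads a single serialized object back from a raw BytesWritable buffer.
  @SuppressWarnings("unchecked")
  public static <T> T deserialize(byte[] bytes) throws IOException, ClassNotFoundException {
    ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes));
    try {
      return (T) ois.readObject(); // stops at the object boundary; trailing padding is ignored
    } finally {
      ois.close();
    }
  }
}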
From source file: weka.distributed.hadoop.KMeansCentroidSketchHadoopReducer.java
License: Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
  int runNum = 0;
  String rS = key.toString();
  rS = rS.replace("run", "");
  try {
    runNum = Integer.parseInt(rS);
  } catch (NumberFormatException ex) {
    throw new IOException(ex);
  }

  CentroidSketch initial = null;
  List<NormalizableDistance> distsForRun = new ArrayList<NormalizableDistance>();
  try {
    for (BytesWritable b : values) {
      byte[] bytes = b.getBytes();
      CentroidSketch current = deserialize(bytes);

      if (initial == null) {
        initial = current;
      } else {
        initial.aggregateReservoir(current.getReservoirSample());
      }

      if (m_isFirstIteration) {
        distsForRun.add(current.getDistanceFunction());
      }
    }

    // add the reservoir to the current sketch
    initial.addReservoirToCurrentSketch();

    // update the distance function with global numeric
    // attribute ranges
    if (m_isFirstIteration) {
      Instances distancePrimingData = KMeansReduceTask
          .computeDistancePrimingDataFromDistanceFunctions(distsForRun, m_transformedHeaderNoSummary);
      initial.getDistanceFunction().setInstances(distancePrimingData);
    }

    // save the sketch out
    writeSketchToDestination(initial, m_outputDestination, runNum, context.getConfiguration());

    System.err.println("Number of instances in sketch for run " + runNum + ": "
        + initial.getCurrentSketch().numInstances());

    Text outKey = new Text();
    outKey.set("Summary:\n");
    Text outVal = new Text();
    outVal.set("Number of instances in sketch for run " + runNum + ": "
        + initial.getCurrentSketch().numInstances());
    context.write(outKey, outVal);
  } catch (Exception ex) {
    throw new IOException(ex);
  }
}
From source file: weka.distributed.hadoop.KMeansHadoopReducer.java
License: Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
  int runNum = 0;
  String rS = key.toString();
  rS = rS.replace("run", "");
  try {
    runNum = Integer.parseInt(rS);
  } catch (NumberFormatException ex) {
    throw new IOException(ex);
  }

  Instances transformedHeaderNoSummary = null;
  List<List<Instances>> partialClusterSummariesForRun = new ArrayList<List<Instances>>();

  try {
    for (BytesWritable b : values) {
      byte[] bytes = b.getBytes();
      KMeansMapTask current = deserialize(bytes);

      if (transformedHeaderNoSummary == null) {
        transformedHeaderNoSummary = current.getTransformedHeader();
      }

      partialClusterSummariesForRun.add(current.getCentroidStats());
    }
  } catch (Exception ex) {
    throw new IOException(ex);
  }

  if (transformedHeaderNoSummary != null && partialClusterSummariesForRun.size() > 0) {
    KMeansReduceTask reducer = new KMeansReduceTask();
    try {
      reducer = reducer.reduceClusters(runNum, m_iterationNumber, transformedHeaderNoSummary,
          partialClusterSummariesForRun);
      writeReduceTaskToDestination(reducer, m_outputDestination, runNum, context.getConfiguration());
      System.err.println("Wrote reducer for run: " + runNum + ". Total within clust err: "
          + reducer.getTotalWithinClustersError());
    } catch (DistributedWekaException e) {
      throw new IOException(e);
    }
  } else {
    if (transformedHeaderNoSummary == null) {
      throw new IOException("Was unable to get the transformed header from the KMeansMapTasks!");
    }
    if (partialClusterSummariesForRun.size() == 0) {
      throw new IOException("There were no cluster summaries to aggregate!");
    }
  }
}
From source file: weka.distributed.hadoop.WekaClassifierEvaluationHadoopReducer.java
License: Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
  Configuration conf = context.getConfiguration();
  String mapTaskOptsS = conf.get(WekaClassifierHadoopMapper.CLASSIFIER_MAP_TASK_OPTIONS);
  Instances trainingHeader = null;
  Instances headerWithoutSummaryAtts = null;
  // double[] priors = null;
  // double priorsCount = 0;
  String totalFolds = "";
  String seed = "1";
  String separateTestSet = "";

  try {
    String[] taskOpts = Utils.splitOptions(mapTaskOptsS);
    String arffHeaderFileName = Utils.getOption("arff-header", taskOpts);
    totalFolds = Utils.getOption("total-folds", taskOpts);
    seed = Utils.getOption("seed", taskOpts);
    separateTestSet = Utils.getOption("test-set-path", taskOpts);
    trainingHeader = WekaClassifierHadoopMapper.loadTrainingHeader(arffHeaderFileName);
    headerWithoutSummaryAtts = CSVToARFFHeaderReduceTask.stripSummaryAtts(trainingHeader);
    WekaClassifierHadoopMapper.setClassIndex(taskOpts, headerWithoutSummaryAtts, true);

    Attribute classAtt = headerWithoutSummaryAtts.classAttribute();
    if (classAtt == null) {
      throw new Exception("Class attribute is null!!");
    }

    // now look for the summary stats att with this name so that we can set up
    // priors for evaluation properly
    String classAttSummaryName = CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX + classAtt.name();
    Attribute summaryClassAtt = trainingHeader.attribute(classAttSummaryName);
    if (summaryClassAtt == null) {
      throw new Exception("WekaClassifierEvaluationHadoopReducer - was unable to find "
          + "the summary meta data attribute for the class attribute in the header");
    }
  } catch (Exception ex) {
    throw new IOException(ex);
  }

  List<Evaluation> evalsToAgg = new ArrayList<Evaluation>();
  try {
    for (BytesWritable b : values) {
      byte[] bytes = b.getBytes();
      evalsToAgg.add(deserialize(bytes));
    }
  } catch (Exception ex) {
    throw new IOException(ex);
  }

  int totalFoldsI = 1;
  if (!DistributedJobConfig.isEmpty(totalFolds)) {
    try {
      totalFoldsI = Integer.parseInt(totalFolds);
    } catch (NumberFormatException ex) {
    }
  }

  try {
    Evaluation aggregated = m_task.aggregate(evalsToAgg);

    Text outkey = new Text();
    String info = "Summary - ";
    if (!DistributedJobConfig.isEmpty(separateTestSet)) {
      info += "separate test set";
    } else if (totalFoldsI == 1) {
      info += "test on training";
    } else {
      info += totalFolds + " fold cross-validation (seed=" + seed
          + ")\n(note: relative measures might be slightly "
          + "pessimistic due to the mean/mode of the target being computed on "
          + "all the data rather than on training folds)";
    }
    info += ":\n";
    if (aggregated.predictions() != null) {
      info += "Number of predictions retained for computing AUC/AUPRC: "
          + aggregated.predictions().size() + "\n";
    }
    outkey.set(info);
    Text outVal = new Text();
    outVal.set(aggregated.toSummaryString() + "\n");
    context.write(outkey, outVal);

    outVal = new Text();
    if (aggregated.getHeader().classAttribute().isNominal()) {
      outVal.set(aggregated.toClassDetailsString() + "\n");
      context.write(null, outVal);
      outVal = new Text();
      outVal.set(aggregated.toMatrixString() + "\n");
      context.write(null, outVal);
    }

    // convert the evaluation into an ARFF or CSV file
    String outputDir = context.getConfiguration().get("mapred.output.dir");
    if (DistributedJobConfig.isEmpty(outputDir)) {
      throw new Exception("WekaClassifierEvaluationReducer - unable to get the output directory "
          + "for some reason!");
    }
    writeEvalAsStructured(aggregated, outputDir, context.getConfiguration());
  } catch (Exception ex) {
    throw new IOException(ex);
  }
}
From source file: weka.distributed.hadoop.WekaClassifierHadoopReducer.java
License: Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
  Configuration conf = context.getConfiguration();
  String outputDestination = conf.get(CLASSIFIER_WRITE_PATH);

  if (outputDestination == null || outputDestination.length() == 0) {
    throw new IOException("No destination given for aggregated classifier");
  }

  String minTrainingFrac = conf.get(MIN_TRAINING_FRACTION);
  if (!DistributedJobConfig.isEmpty(minTrainingFrac)) {
    double frac = Double.parseDouble(minTrainingFrac);
    if (frac > 1) {
      frac /= 100.0;
    }
    m_task.setMinTrainingFraction(frac);
  }

  String mapTaskOpts = conf.get(WekaClassifierHadoopMapper.CLASSIFIER_MAP_TASK_OPTIONS);
  boolean forceVote = false;
  try {
    forceVote = Utils.getFlag("force-vote", Utils.splitOptions(mapTaskOpts));
  } catch (Exception e) {
    throw new IOException(e);
  }

  List<Classifier> classifiersToAgg = new ArrayList<Classifier>();
  List<Integer> numTrainingInstancesPerClassifier = new ArrayList<Integer>();

  try {
    for (BytesWritable b : values) {
      byte[] bytes = b.getBytes();
      List<Object> info = deserialize(bytes);
      classifiersToAgg.add((Classifier) info.get(0));
      numTrainingInstancesPerClassifier.add((Integer) info.get(1));
    }
  } catch (ClassNotFoundException ex) {
    throw new IOException(ex);
  }

  try {
    Classifier aggregated = m_task.aggregate(classifiersToAgg, numTrainingInstancesPerClassifier, forceVote);

    int numAggregated = classifiersToAgg.size();
    classifiersToAgg = null; // save memory
    System.gc();
    Runtime currR = Runtime.getRuntime();
    long freeM = currR.freeMemory();
    long totalM = currR.totalMemory();
    long maxM = currR.maxMemory();
    System.err.println("[WekaClassifierHadoopReducer] Memory (free/total/max.) in bytes: "
        + String.format("%,d", freeM) + " / " + String.format("%,d", totalM) + " / "
        + String.format("%,d", maxM));

    writeClassifierToDestination(aggregated, outputDestination, conf);

    Text outkey = new Text();
    outkey.set("Summary:\n");
    Text outVal = new Text();
    StringBuffer buff = new StringBuffer();
    buff.append("Number of training instances processed by each classifier: ");
    for (Integer i : numTrainingInstancesPerClassifier) {
      buff.append(i).append(" ");
    }

    if (m_task.getDiscarded().size() > 0) {
      buff.append("\nThere was one classifier not aggregated because it "
          + "had seen less than " + m_task.getMinTrainingFraction() * 100.0
          + "% of the amount of data (" + m_task.getDiscarded().get(0)
          + " instances) that the others had\n");
    }

    outVal.set("Number of classifiers aggregated: " + numAggregated + ". Final classifier is a "
        + aggregated.getClass().getName() + "\n" + buff.toString());
    context.write(outkey, outVal);

    if (!m_suppressAggregatedClassifierTextualOutput) {
      outkey.set("Aggregated model:\n");
      outVal.set(aggregated.toString());
      context.write(outkey, outVal);
    }
  } catch (Exception ex) {
    throw new IOException(ex);
  }
}
From source file: weka.distributed.hadoop.WekaFoldBasedClassifierHadoopReducer.java
License: Open Source License
@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
  Configuration conf = context.getConfiguration();
  String outputDestination = conf.get(CLASSIFIER_WRITE_PATH);

  if (outputDestination == null || outputDestination.length() == 0) {
    throw new IOException("No destination given for aggregated classifier");
  }

  // need to prepend the fold number so that the evaluation phase
  // (or the next iteration of incremental learning) can load
  // the appropriate aggregated classifier for the fold being
  // considered
  String foldString = key.toString();
  foldString = foldString.substring(foldString.lastIndexOf("_") + 1, foldString.length());
  int fold = -1;
  try {
    fold = Integer.parseInt(foldString.trim());
  } catch (NumberFormatException n) {
    throw new IOException(n);
  }

  String modelNameOnly = outputDestination.substring(outputDestination.lastIndexOf("/") + 1,
      outputDestination.length());
  outputDestination = outputDestination.substring(0, outputDestination.lastIndexOf("/") + 1);
  outputDestination += ("" + fold + "_" + modelNameOnly);

  String minTrainingFrac = conf.get(MIN_TRAINING_FRACTION);
  if (minTrainingFrac != null && minTrainingFrac.length() > 0) {
    double frac = Double.parseDouble(minTrainingFrac);
    if (frac > 1) {
      frac /= 100.0;
    }
    m_task.setMinTrainingFraction(frac);
  }

  String mapTaskOpts = conf.get(WekaClassifierHadoopMapper.CLASSIFIER_MAP_TASK_OPTIONS);
  boolean forceVote = false;
  try {
    forceVote = Utils.getFlag("force-vote", Utils.splitOptions(mapTaskOpts));
  } catch (Exception e) {
    throw new IOException(e);
  }

  List<Classifier> classifiersToAgg = new ArrayList<Classifier>();
  List<Integer> numTrainingInstancesPerClassifier = new ArrayList<Integer>();

  try {
    for (BytesWritable b : values) {
      byte[] bytes = b.getBytes();
      List<Object> info = deserialize(bytes);
      classifiersToAgg.add((Classifier) info.get(0));
      numTrainingInstancesPerClassifier.add((Integer) info.get(1));
    }
  } catch (Exception ex) {
    throw new IOException(ex);
  }

  try {
    Classifier aggregated = m_task.aggregate(classifiersToAgg, numTrainingInstancesPerClassifier, forceVote);
    writeClassifierToDestination(aggregated, outputDestination, conf);

    int numAggregated = classifiersToAgg.size();
    classifiersToAgg = null;
    System.gc();
    Runtime currR = Runtime.getRuntime();
    long freeM = currR.freeMemory();
    long totalM = currR.totalMemory();
    long maxM = currR.maxMemory();
    System.err.println("[WekaClassifierHadoopReducer] Memory (free/total/max.) in bytes: "
        + String.format("%,d", freeM) + " / " + String.format("%,d", totalM) + " / "
        + String.format("%,d", maxM));

    Text outkey = new Text();
    outkey.set("Summary for fold number " + fold + ":\n");
    Text outVal = new Text();
    StringBuffer buff = new StringBuffer();
    buff.append("Number of training instances processed by each classifier: ");
    for (Integer i : numTrainingInstancesPerClassifier) {
      buff.append(i).append(" ");
    }

    if (m_task.getDiscarded().size() > 0) {
      buff.append("\nThere was one classifier not aggregated because it "
          + "had seen less than " + m_task.getMinTrainingFraction() * 100.0
          + "% of the amount of data (" + m_task.getDiscarded().get(0)
          + " instances) that the others had\n");
    }

    outVal.set("Number of classifiers aggregated: " + numAggregated + ". Final classifier is a "
        + aggregated.getClass().getName() + "\n" + buff.toString());
    context.write(outkey, outVal);

    if (!m_suppressAggregatedClassifierTextualOutput) {
      outkey.set("Aggregated model for fold number " + fold + ":\n");
      outVal.set(aggregated.toString());
      context.write(outkey, outVal);
    }
  } catch (Exception ex) {
    throw new IOException(ex);
  }
}