Example usage for org.apache.hadoop.io BytesWritable getBytes

Introduction

On this page you can find usage examples for org.apache.hadoop.io BytesWritable getBytes.

Prototype

@Override
public byte[] getBytes() 

Document

Get the data backing the BytesWritable.
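
Note that getBytes() returns the raw backing buffer, which may be longer than the valid data (for example, after the writable has been reused for a smaller value), so it should normally be paired with getLength(). A minimal, self-contained sketch of the pattern (on recent Hadoop versions, copyBytes() performs the same copy in one call):

import java.util.Arrays;

import org.apache.hadoop.io.BytesWritable;

public class GetBytesExample {
    public static void main(String[] args) {
        BytesWritable writable = new BytesWritable(new byte[] { 1, 2, 3 });

        // getBytes() exposes the whole backing buffer; only the first
        // getLength() bytes are valid, so copy that prefix when an
        // exact-size array is needed.
        byte[] backing = writable.getBytes();
        byte[] valid = Arrays.copyOfRange(backing, 0, writable.getLength());

        System.out.println(backing.length + " bytes in buffer, " + valid.length + " valid");
    }
}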

Usage

From source file: weka.distributed.hadoop.CSVToArffHeaderHadoopReducer.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {
    Configuration conf = context.getConfiguration();
    String outputDestination = conf.get(CSV_TO_ARFF_HEADER_WRITE_PATH);

    if (outputDestination == null || outputDestination.length() == 0) {
        throw new IOException("No destination given for aggregated ARFF header");
    }

    List<Instances> headersToAgg = new ArrayList<Instances>();
    List<HeaderAndQuantileDataHolder> holdersToAgg = new ArrayList<HeaderAndQuantileDataHolder>();

    int counter = 0;
    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();
            if (m_estimateQuantiles) {
                HeaderAndQuantileDataHolder holder = deserializeHolder(bytes);
                holdersToAgg.add(holder);
            } else {
                Instances aHeader = deserializeHeader(bytes);
                headersToAgg.add(aHeader);
            }
            counter++;
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    }

    try {
        Instances aggregated = m_estimateQuantiles ? m_task.aggregateHeadersAndQuartiles(holdersToAgg)
                : m_task.aggregate(headersToAgg);
        writeHeaderToDestination(aggregated, outputDestination, conf);

        Text outkey = new Text();
        outkey.set("AKey");
        Text outval = new Text();
        outval.set("Num headers aggregated " + counter);
        context.write(outkey, outval);

    } catch (Exception e) {
        throw new IOException(e);
    }
}
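
The deserializeHeader() and deserializeHolder() helpers are not shown in this excerpt. A plausible sketch of such a helper, assuming plain Java serialization over java.io streams (Weka's actual implementation may differ, for instance by compressing the stream):

// Hypothetical helper, not taken from the Weka sources: rebuilds an
// Instances header from the bytes returned by BytesWritable.getBytes().
// Java deserialization stops at the end of the encoded object, so any
// padding past getLength() in the backing buffer is simply ignored.
protected Instances deserializeHeader(byte[] bytes) throws IOException, ClassNotFoundException {
    ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(new ByteArrayInputStream(bytes)));
    try {
        return (Instances) ois.readObject();
    } finally {
        ois.close();
    }
}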

From source file: weka.distributed.hadoop.KMeansCentroidSketchHadoopReducer.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {

    int runNum = 0;
    String rS = key.toString();
    rS = rS.replace("run", "");
    try {
        runNum = Integer.parseInt(rS);
    } catch (NumberFormatException ex) {
        throw new IOException(ex);
    }
    CentroidSketch initial = null;

    List<NormalizableDistance> distsForRun = new ArrayList<NormalizableDistance>();
    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();

            CentroidSketch current = deserialize(bytes);
            if (initial == null) {
                initial = current;
            } else {
                initial.aggregateReservoir(current.getReservoirSample());
            }

            if (m_isFirstIteration) {
                distsForRun.add(current.getDistanceFunction());
            }
        }

        // add the reservoir to the current sketch
        initial.addReservoirToCurrentSketch();

        // update the distance function with global numeric
        // attribute ranges
        if (m_isFirstIteration) {
            Instances distancePrimingData = KMeansReduceTask
                    .computeDistancePrimingDataFromDistanceFunctions(distsForRun, m_transformedHeaderNoSummary);

            initial.getDistanceFunction().setInstances(distancePrimingData);
        }

        // save the sketch out
        writeSketchToDestination(initial, m_outputDestination, runNum, context.getConfiguration());

        System.err.println("Number of instances in sketch for run " + runNum + ": "
                + initial.getCurrentSketch().numInstances());
        Text outKey = new Text();
        outKey.set("Summary:\n");
        Text outVal = new Text();
        outVal.set("Number of instances in sketch for run " + runNum + ": "
                + initial.getCurrentSketch().numInstances());

        context.write(outKey, outVal);
    } catch (Exception ex) {
        throw new IOException(ex);
    }
}

From source file: weka.distributed.hadoop.KMeansHadoopReducer.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {

    int runNum = 0;
    String rS = key.toString();
    rS = rS.replace("run", "");
    try {
        runNum = Integer.parseInt(rS);
    } catch (NumberFormatException ex) {
        throw new IOException(ex);
    }

    Instances transformedHeaderNoSummary = null;
    List<List<Instances>> partialClusterSummariesForRun = new ArrayList<List<Instances>>();

    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();
            KMeansMapTask current = deserialize(bytes);
            if (transformedHeaderNoSummary == null) {
                transformedHeaderNoSummary = current.getTransformedHeader();
            }

            partialClusterSummariesForRun.add(current.getCentroidStats());
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    }

    if (transformedHeaderNoSummary != null && partialClusterSummariesForRun.size() > 0) {
        KMeansReduceTask reducer = new KMeansReduceTask();

        try {
            reducer = reducer.reduceClusters(runNum, m_iterationNumber, transformedHeaderNoSummary,
                    partialClusterSummariesForRun);

            writeReduceTaskToDestination(reducer, m_outputDestination, runNum, context.getConfiguration());

            System.err.println("Wrote reducer for run: " + runNum + ". Total within clust err: "
                    + reducer.getTotalWithinClustersError());
        } catch (DistributedWekaException e) {
            throw new IOException(e);
        }
    } else {
        if (transformedHeaderNoSummary == null) {
            throw new IOException("Was unable to get the transformed header from the KMeansMapTasks!");
        }
        if (partialClusterSummariesForRun.size() == 0) {
            throw new IOException("There were no custer summaries to aggregate!");
        }
    }
}

From source file: weka.distributed.hadoop.WekaClassifierEvaluationHadoopReducer.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {

    Configuration conf = context.getConfiguration();
    String mapTaskOptsS = conf.get(WekaClassifierHadoopMapper.CLASSIFIER_MAP_TASK_OPTIONS);

    Instances trainingHeader = null;
    Instances headerWithoutSummaryAtts = null;
    // double[] priors = null;
    // double priorsCount = 0;
    String totalFolds = "";
    String seed = "1";
    String separateTestSet = "";
    try {
        String[] taskOpts = Utils.splitOptions(mapTaskOptsS);
        String arffHeaderFileName = Utils.getOption("arff-header", taskOpts);
        totalFolds = Utils.getOption("total-folds", taskOpts);
        seed = Utils.getOption("seed", taskOpts);
        separateTestSet = Utils.getOption("test-set-path", taskOpts);

        trainingHeader = WekaClassifierHadoopMapper.loadTrainingHeader(arffHeaderFileName);

        headerWithoutSummaryAtts = CSVToARFFHeaderReduceTask.stripSummaryAtts(trainingHeader);
        WekaClassifierHadoopMapper.setClassIndex(taskOpts, headerWithoutSummaryAtts, true);

        Attribute classAtt = headerWithoutSummaryAtts.classAttribute();
        if (classAtt == null) {
            throw new Exception("Class attribute is null!!");
        }

        // now look for the summary stats att with this name so that we can set up
        // priors for evaluation properly
        String classAttSummaryName = CSVToARFFHeaderMapTask.ARFF_SUMMARY_ATTRIBUTE_PREFIX + classAtt.name();
        Attribute summaryClassAtt = trainingHeader.attribute(classAttSummaryName);

        if (summaryClassAtt == null) {
            throw new Exception("WekaClassifierEvaluationHadoopReducer - was unable to find "
                    + "the summary meta data attribute for the class attribute in the header");
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    }

    List<Evaluation> evalsToAgg = new ArrayList<Evaluation>();

    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();
            evalsToAgg.add(deserialize(bytes));
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    }

    int totalFoldsI = 1;
    if (!DistributedJobConfig.isEmpty(totalFolds)) {
        try {
            totalFoldsI = Integer.parseInt(totalFolds);
        } catch (NumberFormatException ex) {
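            // parse failure is deliberately ignored; totalFoldsI keeps its default of 1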
        }
    }
    try {
        Evaluation aggregated = m_task.aggregate(evalsToAgg);

        Text outkey = new Text();
        String info = "Summary - ";
        if (!DistributedJobConfig.isEmpty(separateTestSet)) {
            info += "separate test set";
        } else if (totalFoldsI == 1) {
            info += "test on training";
        } else {
            info += totalFolds + " fold cross-validation (seed=" + seed
                    + ")\n(note: relative measures might be slightly "
                    + "pessimistic due to the mean/mode of the target being computed on "
                    + "all the data rather than on training folds)";
        }
        info += ":\n";
        if (aggregated.predictions() != null) {
            info += "Number of predictions retained for computing AUC/AUPRC: " + aggregated.predictions().size()
                    + "\n";
        }
        outkey.set(info);
        Text outVal = new Text();
        outVal.set(aggregated.toSummaryString() + "\n");
        context.write(outkey, outVal);

        outVal = new Text();
        if (aggregated.getHeader().classAttribute().isNominal()) {
            outVal.set(aggregated.toClassDetailsString() + "\n");
            context.write(null, outVal);

            outVal = new Text();
            outVal.set(aggregated.toMatrixString() + "\n");
            context.write(null, outVal);
        }

        // convert the evaluation into an ARFF or CSV file
        String outputDir = context.getConfiguration().get("mapred.output.dir");
        if (DistributedJobConfig.isEmpty(outputDir)) {
            throw new Exception("WekaClassifierEvaluationReducer - unable to get the output directory "
                    + "for some reason!");
        }

        writeEvalAsStructured(aggregated, outputDir, context.getConfiguration());
    } catch (Exception ex) {
        throw new IOException(ex);
    }
}

From source file: weka.distributed.hadoop.WekaClassifierHadoopReducer.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {

    Configuration conf = context.getConfiguration();

    String outputDestination = conf.get(CLASSIFIER_WRITE_PATH);

    if (outputDestination == null || outputDestination.length() == 0) {
        throw new IOException("No destination given for aggregated classifier");
    }

    String minTrainingFrac = conf.get(MIN_TRAINING_FRACTION);
    if (!DistributedJobConfig.isEmpty(minTrainingFrac)) {
        double frac = Double.parseDouble(minTrainingFrac);
        if (frac > 1) {
            frac /= 100.0;
        }
        m_task.setMinTrainingFraction(frac);
    }

    String mapTaskOpts = conf.get(WekaClassifierHadoopMapper.CLASSIFIER_MAP_TASK_OPTIONS);
    boolean forceVote = false;
    try {
        forceVote = Utils.getFlag("force-vote", Utils.splitOptions(mapTaskOpts));
    } catch (Exception e) {
        throw new IOException(e);
    }

    List<Classifier> classifiersToAgg = new ArrayList<Classifier>();
    List<Integer> numTrainingInstancesPerClassifier = new ArrayList<Integer>();

    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();

            List<Object> info = deserialize(bytes);
            classifiersToAgg.add((Classifier) info.get(0));
            numTrainingInstancesPerClassifier.add((Integer) info.get(1));
        }
    } catch (ClassNotFoundException ex) {
        throw new IOException(ex);
    }

    try {
        Classifier aggregated = m_task.aggregate(classifiersToAgg, numTrainingInstancesPerClassifier,
                forceVote);

        int numAggregated = classifiersToAgg.size();
        classifiersToAgg = null; // save memory
        System.gc();
        Runtime currR = Runtime.getRuntime();
        long freeM = currR.freeMemory();
        long totalM = currR.totalMemory();
        long maxM = currR.maxMemory();
        System.err.println("[WekaClassifierHadoopReducer] Memory (free/total/max.) in bytes: "
                + String.format("%,d", freeM) + " / " + String.format("%,d", totalM) + " / "
                + String.format("%,d", maxM));

        writeClassifierToDestination(aggregated, outputDestination, conf);

        Text outkey = new Text();
        outkey.set("Summary:\n");
        Text outVal = new Text();
        StringBuffer buff = new StringBuffer();
        buff.append("Number of training instances processed by each classifier: ");
        for (Integer i : numTrainingInstancesPerClassifier) {
            buff.append(i).append(" ");
        }
        if (m_task.getDiscarded().size() > 0) {
            buff.append("\nThere was one classifier not aggregated because it " + "had seen less than "
                    + m_task.getMinTrainingFraction() * 100.0 + "% of amount of data ("
                    + m_task.getDiscarded().get(0) + " instances) that the others had\n");
        }
        outVal.set("Number of classifiers aggregated: " + numAggregated + ". Final classifier is a "
                + aggregated.getClass().getName() + "\n" + buff.toString());
        context.write(outkey, outVal);

        if (!m_suppressAggregatedClassifierTextualOutput) {
            outkey.set("Aggregated model:\n");
            outVal.set(aggregated.toString());
            context.write(outkey, outVal);
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    }
}

From source file: weka.distributed.hadoop.WekaFoldBasedClassifierHadoopReducer.java

License: Open Source License

@Override
public void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException {

    Configuration conf = context.getConfiguration();

    String outputDestination = conf.get(CLASSIFIER_WRITE_PATH);

    if (outputDestination == null || outputDestination.length() == 0) {
        throw new IOException("No destination given for aggregated classifier");
    }

    // need to prepend the fold number so that the evaluation phase
    // (or the next iteration of incremental learning) can load
    // the appropriate aggregated classifier for the fold being
    // considered
    String foldString = key.toString();
    foldString = foldString.substring(foldString.lastIndexOf("_") + 1, foldString.length());
    int fold = -1;
    try {
        fold = Integer.parseInt(foldString.trim());
    } catch (NumberFormatException n) {
        throw new IOException(n);
    }

    String modelNameOnly = outputDestination.substring(outputDestination.lastIndexOf("/") + 1,
            outputDestination.length());
    outputDestination = outputDestination.substring(0, outputDestination.lastIndexOf("/") + 1);
    outputDestination += ("" + fold + "_" + modelNameOnly);

    String minTrainingFrac = conf.get(MIN_TRAINING_FRACTION);
    if (minTrainingFrac != null && minTrainingFrac.length() > 0) {
        double frac = Double.parseDouble(minTrainingFrac);
        if (frac > 1) {
            frac /= 100.0;
        }
        m_task.setMinTrainingFraction(frac);
    }

    String mapTaskOpts = conf.get(WekaClassifierHadoopMapper.CLASSIFIER_MAP_TASK_OPTIONS);
    boolean forceVote = false;
    try {
        forceVote = Utils.getFlag("force-vote", Utils.splitOptions(mapTaskOpts));
    } catch (Exception e) {
        throw new IOException(e);
    }

    List<Classifier> classifiersToAgg = new ArrayList<Classifier>();
    List<Integer> numTrainingInstancesPerClassifier = new ArrayList<Integer>();

    try {
        for (BytesWritable b : values) {
            byte[] bytes = b.getBytes();

            List<Object> info = deserialize(bytes);
            classifiersToAgg.add((Classifier) info.get(0));
            numTrainingInstancesPerClassifier.add((Integer) info.get(1));
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    }

    try {
        Classifier aggregated = m_task.aggregate(classifiersToAgg, numTrainingInstancesPerClassifier,
                forceVote);
        writeClassifierToDestination(aggregated, outputDestination, conf);

        int numAggregated = classifiersToAgg.size();
        classifiersToAgg = null;
        System.gc();
        Runtime currR = Runtime.getRuntime();
        long freeM = currR.freeMemory();
        long totalM = currR.totalMemory();
        long maxM = currR.maxMemory();
        System.err.println("[WekaClassifierHadoopReducer] Memory (free/total/max.) in bytes: "
                + String.format("%,d", freeM) + " / " + String.format("%,d", totalM) + " / "
                + String.format("%,d", maxM));

        Text outkey = new Text();
        outkey.set("Summary for fold number " + fold + ":\n");
        Text outVal = new Text();
        StringBuffer buff = new StringBuffer();
        buff.append("Number of training instances processed by each classifier: ");
        for (Integer i : numTrainingInstancesPerClassifier) {
            buff.append(i).append(" ");
        }
        if (m_task.getDiscarded().size() > 0) {
            buff.append("\nThere was one classifier not aggregated because it " + "had seen less than "
                    + m_task.getMinTrainingFraction() * 100.0 + "% of amount of data ("
                    + m_task.getDiscarded().get(0) + " instances) that the others had\n");
        }
        outVal.set("Number of classifiers aggregated: " + numAggregated + ". Final classifier is a "
                + aggregated.getClass().getName() + "\n" + buff.toString());
        context.write(outkey, outVal);

        if (!m_suppressAggregatedClassifierTextualOutput) {
            outkey.set("Aggregated model for fold number " + fold + ":\n");
            outVal.set(aggregated.toString());
            context.write(outkey, outVal);
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    }
}
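
All of the reducers above consume BytesWritable values that the corresponding map tasks produced. A hedged sketch of that producing side (illustrative names, not taken from the Weka sources): serialize the value and wrap the result in a BytesWritable, whose constructor records the exact valid length that getBytes()/getLength() later report on the reduce side.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;

import org.apache.hadoop.io.BytesWritable;

public class SerializeExample {

    // Illustrative counterpart to the reducers above: turns a serializable
    // value into a BytesWritable ready to be written by a mapper.
    public static BytesWritable serializeToBytesWritable(Object value) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        ObjectOutputStream oos = new ObjectOutputStream(baos);
        try {
            oos.writeObject(value);
            oos.flush();
        } finally {
            oos.close();
        }
        // The BytesWritable(byte[]) constructor sets both the buffer and the
        // valid length, so getLength() on the reduce side matches exactly.
        return new BytesWritable(baos.toByteArray());
    }
}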