Example usage for org.apache.hadoop.io NullWritable get

List of usage examples for org.apache.hadoop.io NullWritable get

Introduction

On this page you can find example usage for org.apache.hadoop.io.NullWritable#get().

Prototype

public static NullWritable get() 

Document

Returns the single instance of this class.
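
NullWritable is a zero-length, singleton Writable: get() always returns the same shared instance, and it serializes to no bytes. It is typically passed wherever a key or value slot must be filled but carries no information. As a minimal sketch (the class name and type parameters below are hypothetical, not taken from the examples that follow), a mapper can emit NullWritable.get() as the key so that only the values appear in the job output:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ValueOnlyMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // NullWritable.get() returns the shared singleton; it writes no bytes,
        // so TextOutputFormat emits only the value on each line.
        context.write(NullWritable.get(), value);
    }
}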

Usage

From source file:ml.shifu.shifu.core.binning.UpdateBinningInfoReducer.java

License:Apache License

@Override
protected void reduce(IntWritable key, Iterable<BinningInfoWritable> values, Context context)
        throws IOException, InterruptedException {
    long start = System.currentTimeMillis();
    double sum = 0d;
    double squaredSum = 0d;
    double tripleSum = 0d;
    double quarticSum = 0d;

    long count = 0L, missingCount = 0L;
    // -Double.MAX_VALUE rather than Double.MIN_VALUE (the smallest positive double), so all-negative maxima are found
    double min = Double.MAX_VALUE, max = -Double.MAX_VALUE;
    List<Double> binBoundaryList = null;
    List<String> binCategories = null;
    long[] binCountPos = null;
    long[] binCountNeg = null;
    double[] binWeightPos = null;
    double[] binWeightNeg = null;

    ColumnConfig columnConfig = this.columnConfigList.get(key.get());

    int binSize = 0;
    for (BinningInfoWritable info : values) {
        if (info.isNumeric() && binBoundaryList == null) {
            binBoundaryList = info.getBinBoundaries();
            binSize = binBoundaryList.size();
            binCountPos = new long[binSize + 1];
            binCountNeg = new long[binSize + 1];
            binWeightPos = new double[binSize + 1];
            binWeightNeg = new double[binSize + 1];
        }
        if (!info.isNumeric() && binCategories == null) {
            binCategories = info.getBinCategories();
            binSize = binCategories.size();
            binCountPos = new long[binSize + 1];
            binCountNeg = new long[binSize + 1];
            binWeightPos = new double[binSize + 1];
            binWeightNeg = new double[binSize + 1];
        }
        count += info.getTotalCount();
        missingCount += info.getMissingCount();
        // For numeric columns these sums are valid; for categorical columns they are all 0 and are
        // recomputed below from binCountPos and binCountNeg.
        sum += info.getSum();
        squaredSum += info.getSquaredSum();
        tripleSum += info.getTripleSum();
        quarticSum += info.getQuarticSum();
        if (Double.compare(max, info.getMax()) < 0) {
            max = info.getMax();
        }

        if (Double.compare(min, info.getMin()) > 0) {
            min = info.getMin();
        }

        for (int i = 0; i < (binSize + 1); i++) {
            binCountPos[i] += info.getBinCountPos()[i];
            binCountNeg[i] += info.getBinCountNeg()[i];
            binWeightPos[i] += info.getBinWeightPos()[i];
            binWeightNeg[i] += info.getBinWeightNeg()[i];
        }
    }

    double[] binPosRate = computePosRate(binCountPos, binCountNeg);

    String binBounString = null;
    if (columnConfig.isCategorical()) {
        if (binCategories.size() == 0 || binCategories.size() > MAX_CATEGORICAL_BINC_COUNT) {
            LOG.warn("Column {} {} with invalid bin boundary size.", key.get(), columnConfig.getColumnName(),
                    binCategories.size());
            return;
        }
        binBounString = Base64Utils.base64Encode(
                "[" + StringUtils.join(binCategories, CalculateStatsUDF.CATEGORY_VAL_SEPARATOR) + "]");
        // recompute such value for categorical variables
        min = Double.MAX_VALUE;
        max = Double.MIN_VALUE;
        sum = 0d;
        squaredSum = 0d;
        for (int i = 0; i < binPosRate.length; i++) {
            if (!Double.isNaN(binPosRate[i])) {
                if (Double.compare(max, binPosRate[i]) < 0) {
                    max = binPosRate[i];
                }

                if (Double.compare(min, binPosRate[i]) > 0) {
                    min = binPosRate[i];
                }
                long binCount = binCountPos[i] + binCountNeg[i];
                sum += binPosRate[i] * binCount;
                double squaredVal = binPosRate[i] * binPosRate[i];
                squaredSum += squaredVal * binCount;
                tripleSum += squaredVal * binPosRate[i] * binCount;
                quarticSum += squaredVal * squaredVal * binCount;
            }
        }
    } else {
        if (binBoundaryList.size() == 0) {
            LOG.warn("Column {} {} with invalid bin boundary size.", key.get(), columnConfig.getColumnName(),
                    binBoundaryList.size());
            return;
        }
        binBounString = binBoundaryList.toString();
    }

    ColumnMetrics columnCountMetrics = ColumnStatsCalculator.calculateColumnMetrics(binCountNeg, binCountPos);
    ColumnMetrics columnWeightMetrics = ColumnStatsCalculator.calculateColumnMetrics(binWeightNeg,
            binWeightPos);

    // TODO & FIXME do we need validCount(totalCount - missingValueCount) for mean and stddev???
    double mean = sum / count;
    double stdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / count + EPS) / (count - 1)));
    double aStdDev = Math.sqrt(Math.abs((squaredSum - (sum * sum) / count + EPS) / count));

    double skewness = ColumnStatsCalculator.computeSkewness(count, mean, aStdDev, sum, squaredSum, tripleSum);
    double kurtosis = ColumnStatsCalculator.computeKurtosis(count, mean, aStdDev, sum, squaredSum, tripleSum,
            quarticSum);

    sb.append(key.get()).append(Constants.DEFAULT_DELIMITER).append(binBounString)
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binCountNeg))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binCountPos))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(new double[0]))
            .append(Constants.DEFAULT_DELIMITER).append(Arrays.toString(binPosRate))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(columnCountMetrics.getKs()))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(columnWeightMetrics.getIv()))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(max)).append(Constants.DEFAULT_DELIMITER)
            .append(df.format(min)).append(Constants.DEFAULT_DELIMITER).append(df.format(mean))
            .append(Constants.DEFAULT_DELIMITER).append(df.format(stdDev)).append(Constants.DEFAULT_DELIMITER)
            .append(columnConfig.isCategorical() ? "C" : "N").append(Constants.DEFAULT_DELIMITER)
            .append(df.format(mean)).append(Constants.DEFAULT_DELIMITER).append(missingCount)
            .append(Constants.DEFAULT_DELIMITER).append(count).append(Constants.DEFAULT_DELIMITER)
            .append(missingCount * 1.0d / count).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binWeightNeg)).append(Constants.DEFAULT_DELIMITER)
            .append(Arrays.toString(binWeightPos)).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getWoe()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getWoe()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getKs()).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getIv()).append(Constants.DEFAULT_DELIMITER)
            .append(columnCountMetrics.getBinningWoe().toString()).append(Constants.DEFAULT_DELIMITER)
            .append(columnWeightMetrics.getBinningWoe().toString()).append(Constants.DEFAULT_DELIMITER)
            .append(skewness).append(Constants.DEFAULT_DELIMITER).append(kurtosis);

    outputValue.set(sb.toString());
    context.write(NullWritable.get(), outputValue);
    sb.delete(0, sb.length());
    LOG.debug("Time:{}", (System.currentTimeMillis() - start));
}

From source file:ml.shifu.shifu.core.posttrain.PostTrainMapper.java

License:Apache License

@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String valueStr = value.toString();
    // StringUtils.isBlank is not used here to avoid importing a new jar
    if (valueStr == null || valueStr.length() == 0 || valueStr.trim().length() == 0) {
        LOG.warn("Empty input.");
        return;
    }

    if (!this.dataPurifier.isFilter(valueStr)) {
        return;
    }

    String[] units = CommonUtils.split(valueStr, this.modelConfig.getDataSetDelimiter());
    // tagColumnNum must be a valid index into units, otherwise an ArrayIndexOutOfBoundsException is thrown
    String tag = CommonUtils.trimTag(units[this.tagColumnNum]);

    if (!this.tags.contains(tag)) {
        if (System.currentTimeMillis() % 20 == 0) {
            LOG.warn("Data with invalid tag is ignored in post train, invalid tag: {}.", tag);
        }
        context.getCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG").increment(1L);
        return;
    }

    Map<String, String> rawDataMap = buildRawDataMap(units);
    CaseScoreResult csr = this.modelRunner.compute(rawDataMap);

    // store score value
    StringBuilder sb = new StringBuilder(500);
    sb.append(csr.getAvgScore()).append(Constants.DEFAULT_DELIMITER).append(csr.getMaxScore())
            .append(Constants.DEFAULT_DELIMITER).append(csr.getMinScore()).append(Constants.DEFAULT_DELIMITER);
    for (Double score : csr.getScores()) {
        sb.append(score).append(Constants.DEFAULT_DELIMITER);
    }
    List<String> metaList = modelConfig.getMetaColumnNames();
    for (String meta : metaList) {
        sb.append(rawDataMap.get(meta)).append(Constants.DEFAULT_DELIMITER);
    }
    sb.deleteCharAt(sb.length() - Constants.DEFAULT_DELIMITER.length());
    this.outputValue.set(sb.toString());
    this.mos.write(Constants.POST_TRAIN_OUTPUT_SCORE, NullWritable.get(), this.outputValue);

    for (int i = 0; i < headers.length; i++) {
        ColumnConfig config = this.columnConfigList.get(i);
        if (!config.isMeta() && !config.isTarget() && config.isFinalSelect()) {
            int binNum = BinUtils.getBinNum(config, units[i]);
            List<BinStats> featureStatistics = this.variableStatsMap.get(config.getColumnNum());
            BinStats bs = null;
            if (binNum == -1) {
                // binNum == -1 means an invalid numeric value (null or empty); the last bin holds the empty stats.
                bs = featureStatistics.get(featureStatistics.size() - 1);
            } else {
                bs = featureStatistics.get(binNum);
            }
            // bs should not be null as already initialized in setup
            bs.setBinSum(csr.getAvgScore() + bs.getBinSum());
            bs.setBinCnt(1L + bs.getBinCnt());
        }
    }
}

From source file:net.jarcec.sqoop.data.gen.mr.GeneratorMapper.java

License:Apache License

@Override
protected void map(LongWritable key, LongWritable value, Context context)
        throws IOException, InterruptedException {
    long from = key.get();
    long to = value.get();

    random = new SecureRandom();
    decimal = new DecimalFormat("###.###");
    date = new SimpleDateFormat("yyyy-MM-dd");
    time = new SimpleDateFormat("HH:mm:ss");
    datetime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");

    String[] types = context.getConfiguration().get(Constants.TYPES).split(",");
    String[] values = new String[types.length];

    for (long i = from; i < to; i++) {
        context.progress();

        int y = 0;
        for (String type : types) {

            if ("id".equals(type)) {
                values[y] = String.valueOf(i);
            } else if ("s50".equals(type)) {
                values[y] = generateString(50);
            } else if ("i".equals(type)) {
                values[y] = generateInteger();
            } else if ("f".equals(type)) {
                values[y] = generateFloat(250, 31);
            } else if ("d".equals(type)) {
                values[y] = generateDate();
            } else if ("t".equals(type)) {
                values[y] = generateTime();
            } else if ("dt".equals(type)) {
                values[y] = generateDateTime();
            } else if ("s255".equals(type)) {
                values[y] = generateString(255);
            } else {
                throw new RuntimeException("Unknown type: " + type);
            }

            y++;
        }

        context.write(new Text(StringUtils.join(values, ",")), NullWritable.get());
    }
}

From source file:nl.bioinf.wvanhelvoirt.HadoopPhredCalculator.CombineReducer.java

License:Open Source License

/**
 * Override method that merges all mapper outputs into one array, ready to be written as a file.
 *
 * @param key     NullWritable not used.
 * @param values  Iterable with AveragePhredCalculator items from each mapper.
 * @param context Context containing job information.
 * @throws IOException          When an I/O error occurs.
 * @throws InterruptedException When the connection is interrupted.
 */
@Override
public void reduce(NullWritable key, Iterable<TextArrayWritable> values, Context context)
        throws IOException, InterruptedException {

    LinkedList<Float> sizablePhredArray = new LinkedList<>();
    LinkedList<Integer> sizableCountArray = new LinkedList<>();

    // For each mapper output, add the values to the running phred and count lists.
    for (TextArrayWritable value : values) {
        Text[] asciiArray = value.get();

        // Text arrays containing values; add these to the sizable lists only if not empty.
        if (asciiArray.length > 0) {
            for (int i = 0; i < asciiArray.length; i++) {
                try {
                    sizablePhredArray.set(i, sizablePhredArray.get(i)
                            + Float.parseFloat(asciiArray[i].toString().split("\\|")[0]));
                    sizableCountArray.set(i, sizableCountArray.get(i)
                            + Integer.parseInt(asciiArray[i].toString().split("\\|")[1]));
                } catch (IndexOutOfBoundsException e) {
                    sizablePhredArray.add(i, Float.parseFloat(asciiArray[i].toString().split("\\|")[0]));
                    sizableCountArray.add(i, Integer.parseInt(asciiArray[i].toString().split("\\|")[1]));
                }
            }
        }
    }

    // Instantiate the Text array and add lines.
    Text[] phredCount = new Text[(sizablePhredArray.size() + 1)];
    phredCount[0] = new Text("base_position\taverage_phred_score");
    for (int i = 0; i < sizablePhredArray.size(); i++) {
        phredCount[i + 1] = new Text((i + 1) + "\t" + (sizablePhredArray.get(i) / sizableCountArray.get(i)));
    }

    // Add the Text array to the ArrayWritable wrapper and return the result.
    context.write(NullWritable.get(), new TextArrayWritable(Text.class, phredCount));
}

From source file:nl.bioinf.wvanhelvoirt.HadoopPhredCalculator.ReadMapper.java

License:Open Source License

/**
 * Override method that processes one RecordReader item and sends its output to the reducing step.
 *
 * @param key     LongWritable as key.
 * @param value   Text containing reads (one read is 4 lines) from the fastq file.
 * @param context Context containing job information.
 * @throws IOException          When an I/O error occurs.
 * @throws InterruptedException When the connection is interrupted.
 */
@Override
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

    // Set the configuration, read data and ascii base value.
    Configuration conf = context.getConfiguration();
    int asciiBase = conf.getInt("ascii.base", 64);
    String[] readData = value.toString().split("\\n");

    LinkedList<Float> sizablePhredArray = new LinkedList<>();
    LinkedList<Integer> sizableCountArray = new LinkedList<>();

    for (int i = 0; i < readData.length; i += 4) {

        // If the length of the base line equals the length of the phred line.
        if (readData[i + 1].length() == readData[i + 3].length()) {

            // Add each character's phred score to the running lists.
            for (int j = 0; j < readData[i + 3].length(); j++) {
                try {
                    sizablePhredArray.set(j,
                            sizablePhredArray.get(j) + ((float) readData[i + 3].charAt(j) - asciiBase));
                    sizableCountArray.set(j, sizableCountArray.get(j) + 1);
                } catch (IndexOutOfBoundsException e) {
                    sizablePhredArray.add(j, ((float) readData[i + 3].charAt(j) - asciiBase));
                    sizableCountArray.add(j, 1);
                }
            }
        }
    }

    // Instantiate the Text array and add lines.
    Text[] phredCount = new Text[sizablePhredArray.size()];
    for (int i = 0; i < sizablePhredArray.size(); i++) {
        phredCount[i] = new Text(sizablePhredArray.get(i) + "|" + sizableCountArray.get(i));
    }

    // Add the Text array to the TextArrayWritable wrapper and write the result.
    context.write(NullWritable.get(), new TextArrayWritable(Text.class, phredCount));
}

From source file:nl.gridline.free.taalserver.CountDocumentsReduce.java

License:Apache License

@Override
protected void reduce(LongWritable key, java.lang.Iterable<VarLongWritable> values, Context context)
        throws IOException, InterruptedException {
    int numberOfDocuments = 0;
    Iterator<VarLongWritable> i = values.iterator();
    while (i.hasNext()) {
        numberOfDocuments++;
        i.next();
    }
    context.write(new VarIntWritable(numberOfDocuments), NullWritable.get());
    context.progress();
}

From source file:nl.gridline.free.taalserver.TFIdfSerializeMap.java

License:Apache License

@Override
protected void map(WordDocId key, TFIdfWritable value, Context context)
        throws IOException, InterruptedException {
    // userID,itemID,preference
    StringBuilder b = new StringBuilder().append(key.getWord().hashCode()).append(',').append(key.getDocId())
            .append(',').append(value.getTdIdf());

    out.set(b.toString());
    context.write(out, NullWritable.get());
    context.progress();
}

From source file:nl.gridline.free.taalserver.TFIdfSerializeReduce.java

License:Apache License

@Override
protected void reduce(Text keyIn, Iterable<NullWritable> values, Context context)
        throws IOException, InterruptedException {
    // just pass the keys through:
    context.write(keyIn, NullWritable.get());
    context.progress();
}

From source file:nl.gridline.zieook.inx.movielens.items.FilterDataReduce.java

License:Apache License

@Override
protected void reduce(Text key, Iterable<Text> values, Context context)
        throws java.io.IOException, InterruptedException {
    // key == itemID
    // value == [allowed,<userID,preference>]
    // output key = userID,itemID,preference

    // put values in a list:
    Set<String> valueSet = new HashSet<String>();
    String itemId = key.toString();
    for (Text s : values) {
        valueSet.add(s.toString());
    }
    if (!shouldFilter || valueSet.contains("allowed")) {
        // write all but 'allowed'
        for (String s : valueSet) {
            if (!"allowed".equals(s)) {
                String[] data = s.split(",");
                if (data.length == 2) {
                    outKey.set(data[0] + "," + itemId + "," + data[1]);
                } else if (data.length == 1) {
                    outKey.set(data[0] + "," + itemId);
                } else {
                    LOG.error("Failed to read item, this is probably an error: value = '" + s + "' key = '"
                            + itemId + "'");
                }
                context.write(outKey, NullWritable.get());
            }
        }
    }
}

From source file:nl.sanoma.hdt.report.generator.ReportGeneratorReducerTest.java

License:Open Source License

/**
 * Test of reduce method, of class ReportGeneratorReducer.
 */
@Test
public void testReduce() throws Exception {
    System.out.println("reduce");
    KeyData key = new KeyData(1, "candy");
    Iterable<ValueData> values = Arrays.asList(new ValueData("Q1", "fruit", 3.0),
            new ValueData("Q2", "fruit", 1.0), new ValueData("Q1", "grocery", 2.0),
            new ValueData("Q3", "grocery", 2.0), new ValueData("Q4", "grocery", 2.0));
    Reducer.Context context = mock(ReportGeneratorReducer.Context.class);
    ReportGeneratorReducer instance = new ReportGeneratorReducer();
    instance.reduce(key, values, context);
    verify(context).write(NullWritable.get(), new Text("1\tgrocery\t\t\t5.0\t1.0\t2.0\t2.0"));

}